[llvm] 595a088 - [AMDGPU] Add support for new LLVM vector types
Mateja Marjanovic via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 29 08:02:51 PST 2022
Author: Mateja Marjanovic
Date: 2022-11-29T17:02:04+01:00
New Revision: 595a08847a4b6e8d52c40715e2fa03e3d7f73189
URL: https://github.com/llvm/llvm-project/commit/595a08847a4b6e8d52c40715e2fa03e3d7f73189
DIFF: https://github.com/llvm/llvm-project/commit/595a08847a4b6e8d52c40715e2fa03e3d7f73189.diff
LOG: [AMDGPU] Add support for new LLVM vector types
Add VReg, AReg and SReg register classes on AMDGPU for bit widths 288, 320, 352 and 384.
Differential Revision: https://reviews.llvm.org/D138205
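
As a rough illustration of what the new classes enable (a hypothetical example, not taken from this patch or its tests): a <9 x i32> value is 288 bits wide, so with these register classes the backend can keep it in a single 288-bit SGPR/VGPR tuple instead of widening it to the next power-of-two vector type. The kernel name and alignment below are illustrative only.

; Hypothetical LLVM IR sketch (not part of this commit).
; A <9 x i32> load/store now maps onto the new 288-bit register classes.
define amdgpu_kernel void @copy_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %v = load <9 x i32>, ptr addrspace(1) %in, align 4
  store <9 x i32> %v, ptr addrspace(1) %out, align 4
  ret void
}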
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
llvm/lib/Target/AMDGPU/MIMGInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
llvm/test/Analysis/CostModel/AMDGPU/fma.ll
llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
llvm/test/Analysis/CostModel/AMDGPU/mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
llvm/test/CodeGen/AMDGPU/function-returns.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
llvm/test/CodeGen/AMDGPU/kernel-args.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
llvm/test/CodeGen/AMDGPU/load-global-f32.ll
llvm/test/CodeGen/AMDGPU/load-global-i32.ll
llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
llvm/test/CodeGen/AMDGPU/select.f16.ll
llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
llvm/test/MC/AMDGPU/gfx1013.s
llvm/test/MC/AMDGPU/gfx1030_new.s
llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
llvm/test/MC/AMDGPU/gfx10_unsupported.s
llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index bfeee37feb4bd..d6a94c972340e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -34,12 +34,24 @@ enum PartialMappingIdx {
PM_SGPR96 = 23,
PM_VGPR96 = 24,
PM_AGPR96 = 25,
- PM_AGPR32 = 31,
- PM_AGPR64 = 32,
- PM_AGPR128 = 33,
- PM_AGPR256 = 34,
- PM_AGPR512 = 35,
- PM_AGPR1024 = 36
+ PM_SGPR288 = 26,
+ PM_VGPR288 = 27,
+ PM_AGPR288 = 28,
+ PM_SGPR320 = 29,
+ PM_VGPR320 = 30,
+ PM_AGPR320 = 31,
+ PM_SGPR352 = 32,
+ PM_VGPR352 = 33,
+ PM_AGPR352 = 34,
+ PM_SGPR384 = 35,
+ PM_VGPR384 = 36,
+ PM_AGPR384 = 37,
+ PM_AGPR32 = 38,
+ PM_AGPR64 = 39,
+ PM_AGPR128 = 40,
+ PM_AGPR256 = 41,
+ PM_AGPR512 = 42,
+ PM_AGPR1024 = 43
};
const RegisterBankInfo::PartialMapping PartMappings[] {
@@ -66,6 +78,18 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 96, SGPRRegBank},
{0, 96, VGPRRegBank},
{0, 96, AGPRRegBank},
+ {0, 288, SGPRRegBank},
+ {0, 288, VGPRRegBank},
+ {0, 288, AGPRRegBank},
+ {0, 320, SGPRRegBank},
+ {0, 320, VGPRRegBank},
+ {0, 320, AGPRRegBank},
+ {0, 352, SGPRRegBank},
+ {0, 352, VGPRRegBank},
+ {0, 352, AGPRRegBank},
+ {0, 384, SGPRRegBank},
+ {0, 384, VGPRRegBank},
+ {0, 384, AGPRRegBank},
{0, 32, AGPRRegBank}, // AGPR begin
{0, 64, AGPRRegBank},
@@ -107,6 +131,18 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[17], 1},
{&PartMappings[18], 1},
{&PartMappings[19], 1},
+ {&PartMappings[20], 1},
+ {&PartMappings[21], 1},
+ {&PartMappings[22], 1},
+ {&PartMappings[23], 1},
+ {&PartMappings[24], 1},
+ {&PartMappings[25], 1},
+ {&PartMappings[26], 1},
+ {&PartMappings[27], 1},
+ {&PartMappings[28], 1},
+ {&PartMappings[29], 1},
+ {&PartMappings[30], 1},
+ {&PartMappings[31], 1},
// AGPRs
{nullptr, 0},
@@ -114,12 +150,12 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[20], 1}, // 32
- {&PartMappings[21], 1}, // 64
- {&PartMappings[22], 1}, // 128
- {&PartMappings[23], 1}, // 256
- {&PartMappings[24], 1}, // 512
- {&PartMappings[25], 1} // 1024
+ {&PartMappings[32], 1}, // 32
+ {&PartMappings[33], 1}, // 64
+ {&PartMappings[34], 1}, // 128
+ {&PartMappings[35], 1}, // 256
+ {&PartMappings[36], 1}, // 512
+ {&PartMappings[37], 1} // 1024
};
const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
@@ -148,7 +184,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
enum ValueMappingIdx {
SGPRStartIdx = 1,
VGPRStartIdx = 12,
- AGPRStartIdx = 26
+ AGPRStartIdx = 38
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
@@ -175,6 +211,62 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
default: llvm_unreachable("Invalid register bank");
}
break;
+ case 288:
+ switch (BankID) {
+ case AMDGPU::VGPRRegBankID:
+ Idx = PM_VGPR288;
+ break;
+ case AMDGPU::SGPRRegBankID:
+ Idx = PM_SGPR288;
+ break;
+ case AMDGPU::AGPRRegBankID:
+ Idx = PM_AGPR288;
+ break;
+ default: llvm_unreachable("Invalid register bank");
+ }
+ break;
+ case 320:
+ switch (BankID) {
+ case AMDGPU::VGPRRegBankID:
+ Idx = PM_VGPR320;
+ break;
+ case AMDGPU::SGPRRegBankID:
+ Idx = PM_SGPR320;
+ break;
+ case AMDGPU::AGPRRegBankID:
+ Idx = PM_AGPR320;
+ break;
+ default: llvm_unreachable("Invalid register bank");
+ }
+ break;
+ case 352:
+ switch (BankID) {
+ case AMDGPU::VGPRRegBankID:
+ Idx = PM_VGPR352;
+ break;
+ case AMDGPU::SGPRRegBankID:
+ Idx = PM_SGPR352;
+ break;
+ case AMDGPU::AGPRRegBankID:
+ Idx = PM_AGPR352;
+ break;
+ default: llvm_unreachable("Invalid register bank");
+ }
+ break;
+ case 384:
+ switch (BankID) {
+ case AMDGPU::VGPRRegBankID:
+ Idx = PM_VGPR384;
+ break;
+ case AMDGPU::SGPRRegBankID:
+ Idx = PM_SGPR384;
+ break;
+ case AMDGPU::AGPRRegBankID:
+ Idx = PM_AGPR384;
+ break;
+ default: llvm_unreachable("Invalid register bank");
+ }
+ break;
default:
switch (BankID) {
case AMDGPU::VGPRRegBankID:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fb4bd6b1a0bb6..e7fb4a76b6a29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -84,6 +84,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
+
+ setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
+
+ setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
+
+ setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
+
setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
@@ -196,6 +208,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+ setOperationAction(ISD::STORE, MVT::v9f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
+
+ setOperationAction(ISD::STORE, MVT::v10f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
+
+ setOperationAction(ISD::STORE, MVT::v11f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
+
+ setOperationAction(ISD::STORE, MVT::v12f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
+
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
@@ -325,19 +349,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS,
- {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
- MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
- MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+ {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
+ MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
+ MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
+ MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
+ MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
Custom);
setOperationAction(
ISD::EXTRACT_SUBVECTOR,
{MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
- MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16,
- MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
- MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64,
- MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
+ MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32,
+ MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
+ MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
+ MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
+ MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
+ MVT::v16f64, MVT::v16i64},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -384,7 +412,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
+ MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@@ -404,7 +433,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
+ MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(
@@ -440,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+ setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
+
+ setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
+
+ setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
+
+ setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -1064,7 +1106,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
// Round up vec3/vec5 argument.
if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
assert(MemVT.getVectorNumElements() == 3 ||
- MemVT.getVectorNumElements() == 5);
+ MemVT.getVectorNumElements() == 5 ||
+ (MemVT.getVectorNumElements() >= 9 &&
+ MemVT.getVectorNumElements() <= 12));
MemVT = MemVT.getPow2VectorType(State.getContext());
} else if (!MemVT.isSimple() && !MemVT.isVector()) {
MemVT = MemVT.getRoundIntegerType(State.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 08a82d2f7a2d0..c148c322be151 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5446,7 +5446,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
Opcode = AMDGPU::getMIMGOpcode(
BaseOpcodes[Is64][IsA16],
IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
- NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
+ NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 50999a4802b39..e83e644d13f3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_288, SReg_320, SReg_352, SReg_384, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 547ca652e57f5..01cb14714ad98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -431,6 +431,46 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsSGPR = false;
IsAGPR = true;
Width = 8;
+ } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 9;
+ } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 9;
+ } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 9;
+ } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 10;
+ } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 10;
+ } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 10;
+ } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 11;
+ } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 11;
+ } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 11;
+ } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 12;
+ } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 12;
+ } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
"trap handler registers should not be used");
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3c6050cba7d56..617b5bb2d859f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2360,6 +2360,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
return AMDGPU::VReg_224RegClassID;
case 256:
return AMDGPU::VReg_256RegClassID;
+ case 288:
+ return AMDGPU::VReg_288RegClassID;
+ case 320:
+ return AMDGPU::VReg_320RegClassID;
+ case 352:
+ return AMDGPU::VReg_352RegClassID;
+ case 384:
+ return AMDGPU::VReg_384RegClassID;
case 512:
return AMDGPU::VReg_512RegClassID;
case 1024:
@@ -2398,6 +2406,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
return AMDGPU::SGPR_224RegClassID;
case 256:
return AMDGPU::SGPR_256RegClassID;
+ case 288:
+ return AMDGPU::SGPR_288RegClassID;
+ case 320:
+ return AMDGPU::SGPR_320RegClassID;
+ case 352:
+ return AMDGPU::SGPR_352RegClassID;
+ case 384:
+ return AMDGPU::SGPR_384RegClassID;
case 512:
return AMDGPU::SGPR_512RegClassID;
}
@@ -2420,6 +2436,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
return AMDGPU::AReg_224RegClassID;
case 256:
return AMDGPU::AReg_256RegClassID;
+ case 288:
+ return AMDGPU::AReg_288RegClassID;
+ case 320:
+ return AMDGPU::AReg_320RegClassID;
+ case 352:
+ return AMDGPU::AReg_352RegClassID;
+ case 384:
+ return AMDGPU::AReg_384RegClassID;
case 512:
return AMDGPU::AReg_512RegClassID;
case 1024:
@@ -3684,7 +3708,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
if (!IsNSA) {
- if (ExpectedAddrSize > 8)
+ if (ExpectedAddrSize > 12)
ExpectedAddrSize = 16;
// Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required.
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ef084cf74a975..8cffbd973317f 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -129,6 +129,9 @@ DECODE_OPERAND_REG(VReg_64)
DECODE_OPERAND_REG(VReg_96)
DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(VReg_256)
+DECODE_OPERAND_REG(VReg_288)
+DECODE_OPERAND_REG(VReg_352)
+DECODE_OPERAND_REG(VReg_384)
DECODE_OPERAND_REG(VReg_512)
DECODE_OPERAND_REG(VReg_1024)
@@ -919,7 +922,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
if (!IsNSA) {
- if (AddrSize > 8)
+ if (AddrSize > 12)
AddrSize = 16;
} else {
if (AddrSize > Info->VAddrDwords) {
@@ -1129,6 +1132,14 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
case AMDGPU::TTMP_256RegClassID:
// ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
// this bundle?
+ case AMDGPU::SGPR_288RegClassID:
+ case AMDGPU::TTMP_288RegClassID:
+ case AMDGPU::SGPR_320RegClassID:
+ case AMDGPU::TTMP_320RegClassID:
+ case AMDGPU::SGPR_352RegClassID:
+ case AMDGPU::TTMP_352RegClassID:
+ case AMDGPU::SGPR_384RegClassID:
+ case AMDGPU::TTMP_384RegClassID:
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::TTMP_512RegClassID:
shift = 2;
@@ -1204,6 +1215,23 @@ MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_288(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_288RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_320(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_320RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_352(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_352RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_384(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_384RegClassID, Val & 255);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
}
@@ -1252,6 +1280,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_256RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_288(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_288RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_320(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_320RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_352(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_352RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_384(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_384RegClassID, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
}
@@ -1302,6 +1346,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
return decodeDstOp(OPW256, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_288(unsigned Val) const {
+ return decodeDstOp(OPW288, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_320(unsigned Val) const {
+ return decodeDstOp(OPW320, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_352(unsigned Val) const {
+ return decodeDstOp(OPW352, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_384(unsigned Val) const {
+ return decodeDstOp(OPW384, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
return decodeDstOp(OPW512, Val);
}
@@ -1460,6 +1520,10 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
case OPW128: return VReg_128RegClassID;
case OPW160: return VReg_160RegClassID;
case OPW256: return VReg_256RegClassID;
+ case OPW288: return VReg_288RegClassID;
+ case OPW320: return VReg_320RegClassID;
+ case OPW352: return VReg_352RegClassID;
+ case OPW384: return VReg_384RegClassID;
case OPW512: return VReg_512RegClassID;
case OPW1024: return VReg_1024RegClassID;
}
@@ -1481,6 +1545,10 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
case OPW128: return AReg_128RegClassID;
case OPW160: return AReg_160RegClassID;
case OPW256: return AReg_256RegClassID;
+ case OPW288: return AReg_288RegClassID;
+ case OPW320: return AReg_320RegClassID;
+ case OPW352: return AReg_352RegClassID;
+ case OPW384: return AReg_384RegClassID;
case OPW512: return AReg_512RegClassID;
case OPW1024: return AReg_1024RegClassID;
}
@@ -1503,6 +1571,10 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
case OPW128: return SGPR_128RegClassID;
case OPW160: return SGPR_160RegClassID;
case OPW256: return SGPR_256RegClassID;
+ case OPW288: return SGPR_288RegClassID;
+ case OPW320: return SGPR_320RegClassID;
+ case OPW352: return SGPR_352RegClassID;
+ case OPW384: return SGPR_384RegClassID;
case OPW512: return SGPR_512RegClassID;
}
}
@@ -1521,6 +1593,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
case OPWV232: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
case OPW256: return TTMP_256RegClassID;
+ case OPW288: return TTMP_288RegClassID;
+ case OPW320: return TTMP_320RegClassID;
+ case OPW352: return TTMP_352RegClassID;
+ case OPW384: return TTMP_384RegClassID;
case OPW512: return TTMP_512RegClassID;
}
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d0aef9cdf79da..b811d70bc108b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -182,6 +182,10 @@ class AMDGPUDisassembler : public MCDisassembler {
MCOperand decodeOperand_VReg_96(unsigned Val) const;
MCOperand decodeOperand_VReg_128(unsigned Val) const;
MCOperand decodeOperand_VReg_256(unsigned Val) const;
+ MCOperand decodeOperand_VReg_288(unsigned Val) const;
+ MCOperand decodeOperand_VReg_320(unsigned Val) const;
+ MCOperand decodeOperand_VReg_352(unsigned Val) const;
+ MCOperand decodeOperand_VReg_384(unsigned Val) const;
MCOperand decodeOperand_VReg_512(unsigned Val) const;
MCOperand decodeOperand_VReg_1024(unsigned Val) const;
@@ -193,12 +197,20 @@ class AMDGPUDisassembler : public MCDisassembler {
MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_128(unsigned Val) const;
MCOperand decodeOperand_SReg_256(unsigned Val) const;
+ MCOperand decodeOperand_SReg_288(unsigned Val) const;
+ MCOperand decodeOperand_SReg_320(unsigned Val) const;
+ MCOperand decodeOperand_SReg_352(unsigned Val) const;
+ MCOperand decodeOperand_SReg_384(unsigned Val) const;
MCOperand decodeOperand_SReg_512(unsigned Val) const;
MCOperand decodeOperand_AGPR_32(unsigned Val) const;
MCOperand decodeOperand_AReg_64(unsigned Val) const;
MCOperand decodeOperand_AReg_128(unsigned Val) const;
MCOperand decodeOperand_AReg_256(unsigned Val) const;
+ MCOperand decodeOperand_AReg_288(unsigned Val) const;
+ MCOperand decodeOperand_AReg_320(unsigned Val) const;
+ MCOperand decodeOperand_AReg_352(unsigned Val) const;
+ MCOperand decodeOperand_AReg_384(unsigned Val) const;
MCOperand decodeOperand_AReg_512(unsigned Val) const;
MCOperand decodeOperand_AReg_1024(unsigned Val) const;
MCOperand decodeOperand_AV_32(unsigned Val) const;
@@ -214,6 +226,10 @@ class AMDGPUDisassembler : public MCDisassembler {
OPW128,
OPW160,
OPW256,
+ OPW288,
+ OPW320,
+ OPW352,
+ OPW384,
OPW512,
OPW1024,
OPW16,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 05588cfc45241..d3aee55c5f35c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -503,6 +503,10 @@ void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 533af659116ee..d018fd88b71a1 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -958,7 +958,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> {
!if(!eq(NumWords, 6), VReg_192,
!if(!eq(NumWords, 7), VReg_224,
!if(!le(NumWords, 8), VReg_256,
- !if(!le(NumWords, 16), VReg_512, ?))))))))));
+ !if(!le(NumWords, 9), VReg_288,
+ !if(!le(NumWords, 10), VReg_320,
+ !if(!le(NumWords, 11), VReg_352,
+ !if(!le(NumWords, 12), VReg_384,
+ !if(!le(NumWords, 16), VReg_512, ?))))))))))))));
// Whether the instruction variant with this vaddr size should be enabled for
// the auto-generated disassembler.
@@ -1007,8 +1011,8 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
!foreach(range,
// V4 is generated for V3 and V4
// V8 is generated for V5 through V8
- // V16 is generated for V9 through V16
- [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]],
+ // V16 is generated for V13 through V16
+ [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9],[10],[11],[12],[13,16]],
MIMGAddrSizes_dw_range<range>),
lhs, dw,
!if(isRangeInList<dw.Min, dw.Max, AllNumAddrWords>.ret,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 198dee022532c..1cba1e446ed44 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -120,6 +120,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
+ addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
+ addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
+
+ addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
+ addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
+
+ addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
+ addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
+
+ addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
+ addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
+
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
@@ -158,15 +170,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD,
- {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
- MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
- MVT::v32i32},
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
+ MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
+ MVT::i1, MVT::v32i32},
Custom);
setOperationAction(ISD::STORE,
- {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
- MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
- MVT::v32i32},
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
+ MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
+ MVT::i1, MVT::v32i32},
Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
@@ -209,12 +223,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
setOperationAction(ISD::TRUNCATE,
- {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
- MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32},
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
+ MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
Expand);
setOperationAction(ISD::FP_ROUND,
- {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
- MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32},
+ {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
+ MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
+ MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG,
@@ -240,11 +256,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT :
- {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64,
- MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
- MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
- MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
- MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
+ {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
+ MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
+ MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
+ MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
+ MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
+ MVT::v32i32, MVT::v32f32}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -365,8 +383,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR,
- {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
- MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+ {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
+ MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
+ MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
+ MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
@@ -4235,6 +4255,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_SRC_V2:
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
+ case AMDGPU::SI_INDIRECT_SRC_V9:
+ case AMDGPU::SI_INDIRECT_SRC_V10:
+ case AMDGPU::SI_INDIRECT_SRC_V11:
+ case AMDGPU::SI_INDIRECT_SRC_V12:
case AMDGPU::SI_INDIRECT_SRC_V16:
case AMDGPU::SI_INDIRECT_SRC_V32:
return emitIndirectSrc(MI, *BB, *getSubtarget());
@@ -4242,6 +4266,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
+ case AMDGPU::SI_INDIRECT_DST_V9:
+ case AMDGPU::SI_INDIRECT_DST_V10:
+ case AMDGPU::SI_INDIRECT_DST_V11:
+ case AMDGPU::SI_INDIRECT_DST_V12:
case AMDGPU::SI_INDIRECT_DST_V16:
case AMDGPU::SI_INDIRECT_DST_V32:
return emitIndirectDst(MI, *BB, *getSubtarget());
@@ -6185,7 +6213,7 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
MVT Type;
unsigned NumElts = Elts.size();
- if (NumElts <= 8) {
+ if (NumElts <= 12) {
Type = MVT::getVectorVT(MVT::f32, NumElts);
} else {
assert(Elts.size() <= 16);
@@ -7735,7 +7763,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
: AMDGPU::MIMGEncGfx10Default,
- NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
+ NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);
@@ -7801,13 +7829,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (!UseNSA) {
// Build a single vector containing all the operands so far prepared.
- if (NumVAddrDwords > 8) {
+ if (NumVAddrDwords > 12) {
SDValue Undef = DAG.getUNDEF(MVT::i32);
Ops.append(16 - Ops.size(), Undef);
}
- assert(Ops.size() == 8 || Ops.size() == 16);
+ assert(Ops.size() >= 8 && Ops.size() <= 12);
SDValue MergedOps = DAG.getBuildVector(
- Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+ MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
Ops.clear();
Ops.push_back(MergedOps);
}
@@ -12466,6 +12494,14 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
return AMDGPU::VReg_224_Align2RegClassID;
case AMDGPU::VReg_256RegClassID:
return AMDGPU::VReg_256_Align2RegClassID;
+ case AMDGPU::VReg_288RegClassID:
+ return AMDGPU::VReg_288_Align2RegClassID;
+ case AMDGPU::VReg_320RegClassID:
+ return AMDGPU::VReg_320_Align2RegClassID;
+ case AMDGPU::VReg_352RegClassID:
+ return AMDGPU::VReg_352_Align2RegClassID;
+ case AMDGPU::VReg_384RegClassID:
+ return AMDGPU::VReg_384_Align2RegClassID;
case AMDGPU::VReg_512RegClassID:
return AMDGPU::VReg_512_Align2RegClassID;
case AMDGPU::VReg_1024RegClassID:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9da08ffaa08ed..6746a9174550e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1300,6 +1300,14 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
+ if (VecSize <= 288) // 36 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
+ if (VecSize <= 320) // 40 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
+ if (VecSize <= 352) // 44 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
+ if (VecSize <= 384) // 48 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
if (VecSize <= 512) // 64 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
if (VecSize <= 1024) // 128 bytes
@@ -1320,6 +1328,14 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
+ if (VecSize <= 288) // 36 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
+ if (VecSize <= 320) // 40 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
+ if (VecSize <= 352) // 44 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
+ if (VecSize <= 384) // 48 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
if (VecSize <= 512) // 64 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
if (VecSize <= 1024) // 128 bytes
@@ -1341,6 +1357,14 @@ static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
+ if (VecSize <= 288) // 36 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
+ if (VecSize <= 320) // 40 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
+ if (VecSize <= 352) // 44 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
+ if (VecSize <= 384) // 48 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
if (VecSize <= 512) // 64 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
@@ -1421,6 +1445,14 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
+ case 36:
+ return AMDGPU::SI_SPILL_S288_SAVE;
+ case 40:
+ return AMDGPU::SI_SPILL_S320_SAVE;
+ case 44:
+ return AMDGPU::SI_SPILL_S352_SAVE;
+ case 48:
+ return AMDGPU::SI_SPILL_S384_SAVE;
case 64:
return AMDGPU::SI_SPILL_S512_SAVE;
case 128:
@@ -1448,6 +1480,14 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
+  case 36:
+    return AMDGPU::SI_SPILL_V288_SAVE;
+  case 40:
+    return AMDGPU::SI_SPILL_V320_SAVE;
+  case 44:
+    return AMDGPU::SI_SPILL_V352_SAVE;
+  case 48:
+    return AMDGPU::SI_SPILL_V384_SAVE;
case 64:
return AMDGPU::SI_SPILL_V512_SAVE;
case 128:
@@ -1588,6 +1628,14 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
+ case 36:
+ return AMDGPU::SI_SPILL_S288_RESTORE;
+ case 40:
+ return AMDGPU::SI_SPILL_S320_RESTORE;
+ case 44:
+ return AMDGPU::SI_SPILL_S352_RESTORE;
+ case 48:
+ return AMDGPU::SI_SPILL_S384_RESTORE;
case 64:
return AMDGPU::SI_SPILL_S512_RESTORE;
case 128:
@@ -1615,6 +1663,14 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
+ case 36:
+ return AMDGPU::SI_SPILL_V288_RESTORE;
+ case 40:
+ return AMDGPU::SI_SPILL_V320_RESTORE;
+ case 44:
+ return AMDGPU::SI_SPILL_V352_RESTORE;
+ case 48:
+ return AMDGPU::SI_SPILL_V384_RESTORE;
case 64:
return AMDGPU::SI_SPILL_V512_RESTORE;
case 128:
@@ -1642,6 +1698,14 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_A224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_A256_RESTORE;
+ case 36:
+ return AMDGPU::SI_SPILL_A288_RESTORE;
+ case 40:
+ return AMDGPU::SI_SPILL_A320_RESTORE;
+ case 44:
+ return AMDGPU::SI_SPILL_A352_RESTORE;
+ case 48:
+ return AMDGPU::SI_SPILL_A384_RESTORE;
case 64:
return AMDGPU::SI_SPILL_A512_RESTORE;
case 128:
@@ -1669,6 +1733,14 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_AV224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_AV256_RESTORE;
+ case 36:
+ return AMDGPU::SI_SPILL_AV288_RESTORE;
+ case 40:
+ return AMDGPU::SI_SPILL_AV320_RESTORE;
+ case 44:
+ return AMDGPU::SI_SPILL_AV352_RESTORE;
+ case 48:
+ return AMDGPU::SI_SPILL_AV384_RESTORE;
case 64:
return AMDGPU::SI_SPILL_AV512_RESTORE;
case 128:
@@ -1974,6 +2046,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
@@ -2025,6 +2101,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
assert(ST.useVGPRIndexMode());
@@ -2064,6 +2144,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
assert(ST.useVGPRIndexMode());
@@ -4531,7 +4615,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
} else {
const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
- if (AddrWords > 8)
+ if (AddrWords > 12)
AddrWords = 16;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0516547e179c3..6c9f38c250f08 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -650,6 +650,10 @@ def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
+def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
+def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
+def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
@@ -657,6 +661,10 @@ def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
+def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
+def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
+def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
@@ -698,6 +706,10 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
@@ -735,6 +747,10 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VR
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
@@ -751,6 +767,10 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
@@ -784,6 +804,10 @@ defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
+defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
+defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
+defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -828,6 +852,10 @@ defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
+defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
+defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
+defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -839,6 +867,10 @@ defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
+defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
+defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
+defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@@ -850,6 +882,10 @@ defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
+defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
+defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
+defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
+defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
@@ -1225,6 +1261,70 @@ foreach Index = 0-7 in {
>;
}
+foreach Index = 0-8 in {
+ def Extract_Element_v9i32_#Index : Extract_Element <
+ i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v9i32_#Index : Insert_Element <
+ i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v9f32_#Index : Extract_Element <
+ f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v9f32_#Index : Insert_Element <
+ f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-9 in {
+ def Extract_Element_v10i32_#Index : Extract_Element <
+ i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v10i32_#Index : Insert_Element <
+ i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v10f32_#Index : Extract_Element <
+ f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v10f32_#Index : Insert_Element <
+ f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-10 in {
+ def Extract_Element_v11i32_#Index : Extract_Element <
+ i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v11i32_#Index : Insert_Element <
+ i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v11f32_#Index : Extract_Element <
+ f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v11f32_#Index : Insert_Element <
+ f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-11 in {
+ def Extract_Element_v12i32_#Index : Extract_Element <
+ i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v12i32_#Index : Insert_Element <
+ i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v12f32_#Index : Extract_Element <
+ f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v12f32_#Index : Insert_Element <
+ f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-15 in {
def Extract_Element_v16i32_#Index : Extract_Element <
i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1482,6 +1582,30 @@ def : BitConvert <v4i64, v16i16, VReg_256>;
def : BitConvert <v4f64, v16f16, VReg_256>;
def : BitConvert <v4f64, v16i16, VReg_256>;
+// 288-bit bitcast
+def : BitConvert <v9i32, v9f32, SReg_288>;
+def : BitConvert <v9f32, v9i32, SReg_288>;
+def : BitConvert <v9i32, v9f32, VReg_288>;
+def : BitConvert <v9f32, v9i32, VReg_288>;
+
+// 320-bit bitcast
+def : BitConvert <v10i32, v10f32, SReg_320>;
+def : BitConvert <v10f32, v10i32, SReg_320>;
+def : BitConvert <v10i32, v10f32, VReg_320>;
+def : BitConvert <v10f32, v10i32, VReg_320>;
+
+// 352-bit bitcast
+def : BitConvert <v11i32, v11f32, SReg_352>;
+def : BitConvert <v11f32, v11i32, SReg_352>;
+def : BitConvert <v11i32, v11f32, VReg_352>;
+def : BitConvert <v11f32, v11i32, VReg_352>;
+
+// 384-bit bitcast
+def : BitConvert <v12i32, v12f32, SReg_384>;
+def : BitConvert <v12f32, v12i32, SReg_384>;
+def : BitConvert <v12i32, v12f32, VReg_384>;
+def : BitConvert <v12f32, v12i32, VReg_384>;
+
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
@@ -2022,12 +2146,20 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">;
+defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">;
+defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">;
+defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">;
+defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">;
+defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">;
+defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d031c2808adcd..7384f1d083c4a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,6 +2449,14 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::VReg_288RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::VReg_320RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::VReg_352RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::VReg_384RegClass;
if (BitWidth <= 512)
return &AMDGPU::VReg_512RegClass;
if (BitWidth <= 1024)
@@ -2473,6 +2481,14 @@ getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_224_Align2RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256_Align2RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::VReg_288_Align2RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::VReg_320_Align2RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::VReg_352_Align2RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::VReg_384_Align2RegClass;
if (BitWidth <= 512)
return &AMDGPU::VReg_512_Align2RegClass;
if (BitWidth <= 1024)
@@ -2509,6 +2525,14 @@ getAnyAGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::AReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::AReg_256RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::AReg_288RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::AReg_320RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::AReg_352RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::AReg_384RegClass;
if (BitWidth <= 512)
return &AMDGPU::AReg_512RegClass;
if (BitWidth <= 1024)
@@ -2533,6 +2557,14 @@ getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::AReg_224_Align2RegClass;
if (BitWidth <= 256)
return &AMDGPU::AReg_256_Align2RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::AReg_288_Align2RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::AReg_320_Align2RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::AReg_352_Align2RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::AReg_384_Align2RegClass;
if (BitWidth <= 512)
return &AMDGPU::AReg_512_Align2RegClass;
if (BitWidth <= 1024)
@@ -2567,6 +2599,14 @@ getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::AV_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::AV_256RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::AV_288RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::AV_320RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::AV_352RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::AV_384RegClass;
if (BitWidth <= 512)
return &AMDGPU::AV_512RegClass;
if (BitWidth <= 1024)
@@ -2591,6 +2631,14 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::AV_224_Align2RegClass;
if (BitWidth <= 256)
return &AMDGPU::AV_256_Align2RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::AV_288_Align2RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::AV_320_Align2RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::AV_352_Align2RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::AV_384_Align2RegClass;
if (BitWidth <= 512)
return &AMDGPU::AV_512_Align2RegClass;
if (BitWidth <= 1024)
@@ -2630,6 +2678,14 @@ SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::SGPR_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::SGPR_256RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::SGPR_288RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::SGPR_320RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::SGPR_352RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::SGPR_384RegClass;
if (BitWidth <= 512)
return &AMDGPU::SGPR_512RegClass;
if (BitWidth <= 1024)
@@ -2686,6 +2742,26 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
&AMDGPU::SReg_256RegClass,
&AMDGPU::AReg_256_Align2RegClass,
&AMDGPU::AReg_256RegClass,
+ &AMDGPU::VReg_288_Align2RegClass,
+ &AMDGPU::VReg_288RegClass,
+ &AMDGPU::SReg_288RegClass,
+ &AMDGPU::AReg_288_Align2RegClass,
+ &AMDGPU::AReg_288RegClass,
+ &AMDGPU::VReg_320_Align2RegClass,
+ &AMDGPU::VReg_320RegClass,
+ &AMDGPU::SReg_320RegClass,
+ &AMDGPU::AReg_320_Align2RegClass,
+ &AMDGPU::AReg_320RegClass,
+ &AMDGPU::VReg_352_Align2RegClass,
+ &AMDGPU::VReg_352RegClass,
+ &AMDGPU::SReg_352RegClass,
+ &AMDGPU::AReg_352_Align2RegClass,
+ &AMDGPU::AReg_352RegClass,
+ &AMDGPU::VReg_384_Align2RegClass,
+ &AMDGPU::VReg_384RegClass,
+ &AMDGPU::SReg_384RegClass,
+ &AMDGPU::AReg_384_Align2RegClass,
+ &AMDGPU::AReg_384RegClass,
&AMDGPU::VReg_512_Align2RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
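All of these bit-width helpers follow the same rule: return the smallest register class whose width covers the request, so the new 288/320/352/384-bit cases simply slot in between the existing 256 and 512 checks. A minimal standalone sketch of that selection rule (the class names below mirror the AMDGPU definitions, but the table is illustrative rather than the in-tree implementation):

  // Pick the smallest VGPR class that can hold BitWidth bits.
  const char *getVGPRClassNameForBitWidth(unsigned BitWidth) {
    struct Entry { unsigned Width; const char *Name; };
    static const Entry Table[] = {
        {32, "VGPR_32"},   {64, "VReg_64"},   {96, "VReg_96"},
        {128, "VReg_128"}, {160, "VReg_160"}, {192, "VReg_192"},
        {224, "VReg_224"}, {256, "VReg_256"},
        {288, "VReg_288"}, {320, "VReg_320"},   // new in this patch
        {352, "VReg_352"}, {384, "VReg_384"},   // new in this patch
        {512, "VReg_512"}, {1024, "VReg_1024"}};
    for (const Entry &E : Table)
      if (BitWidth <= E.Width)
        return E.Name;
    return nullptr; // wider than any register class
  }

  // Example: a <9 x i32> value needs 9 * 32 = 288 bits and now selects
  // VReg_288 instead of being rounded up to VReg_512.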
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index b1e8761fb05e9..21b7c28eed66d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -60,6 +60,16 @@ class getSubRegs<int size> {
list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
+ list<SubRegIndex> ret9 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8];
+ list<SubRegIndex> ret10 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9];
+ list<SubRegIndex> ret11 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10];
+ list<SubRegIndex> ret12 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
sub8, sub9, sub10, sub11,
@@ -80,8 +90,12 @@ class getSubRegs<int size> {
!if(!eq(size, 6), ret6,
!if(!eq(size, 7), ret7,
!if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16,
- ret32))))))));
+ !if(!eq(size, 9), ret9,
+ !if(!eq(size, 10), ret10,
+ !if(!eq(size, 11), ret11,
+ !if(!eq(size, 12), ret12,
+ !if(!eq(size, 16), ret16,
+ ret32))))))))))));
}
// Generates list of sequential register tuple names.
@@ -423,6 +437,18 @@ def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
+// SGPR 288-bit registers. No operations use these, but for symmetry with 288-bit VGPRs.
+def SGPR_288Regs : SIRegisterTuples<getSubRegs<9>.ret, SGPR_32, 105, 4, 9, "s">;
+
+// SGPR 320-bit registers. No operations use these, but for symmetry with 320-bit VGPRs.
+def SGPR_320Regs : SIRegisterTuples<getSubRegs<10>.ret, SGPR_32, 105, 4, 10, "s">;
+
+// SGPR 352-bit registers. No operations use these, but for symmetry with 352-bit VGPRs.
+def SGPR_352Regs : SIRegisterTuples<getSubRegs<11>.ret, SGPR_32, 105, 4, 11, "s">;
+
+// SGPR 384-bit registers. No operations use these, but for symmetry with 384-bit VGPRs.
+def SGPR_384Regs : SIRegisterTuples<getSubRegs<12>.ret, SGPR_32, 105, 4, 12, "s">;
+
// SGPR 512-bit registers
def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">;
@@ -465,6 +491,18 @@ def TTMP_224Regs : SIRegisterTuples<getSubRegs<7>.ret, TTMP_32, 15, 4, 7, "ttmp"
// Trap handler TMP 256-bit registers
def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
+// Trap handler TMP 288-bit registers
+def TTMP_288Regs : SIRegisterTuples<getSubRegs<9>.ret, TTMP_32, 15, 4, 9, "ttmp">;
+
+// Trap handler TMP 320-bit registers
+def TTMP_320Regs : SIRegisterTuples<getSubRegs<10>.ret, TTMP_32, 15, 4, 10, "ttmp">;
+
+// Trap handler TMP 352-bit registers
+def TTMP_352Regs : SIRegisterTuples<getSubRegs<11>.ret, TTMP_32, 15, 4, 11, "ttmp">;
+
+// Trap handler TMP 384-bit registers
+def TTMP_384Regs : SIRegisterTuples<getSubRegs<12>.ret, TTMP_32, 15, 4, 12, "ttmp">;
+
// Trap handler TMP 512-bit registers
def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
@@ -609,6 +647,18 @@ def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
+// VGPR 288-bit registers
+def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 255, 1, 9, "v">;
+
+// VGPR 320-bit registers
+def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 255, 1, 10, "v">;
+
+// VGPR 352-bit registers
+def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 255, 1, 11, "v">;
+
+// VGPR 384-bit registers
+def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 255, 1, 12, "v">;
+
// VGPR 512-bit registers
def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
@@ -653,6 +703,18 @@ def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
// AGPR 256-bit registers
def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
+// AGPR 288-bit registers
+def AGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, AGPR_32, 255, 1, 9, "a">;
+
+// AGPR 320-bit registers
+def AGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, AGPR_32, 255, 1, 10, "a">;
+
+// AGPR 352-bit registers
+def AGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, AGPR_32, 255, 1, 11, "a">;
+
+// AGPR 384-bit registers
+def AGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, AGPR_32, 255, 1, 12, "a">;
+
// AGPR 512-bit registers
def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
@@ -829,6 +891,10 @@ defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
+defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
+defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
+defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
let GlobalPriority = true in {
defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
@@ -873,6 +939,10 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
+defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
+defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
+defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
+defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
let GlobalPriority = true in {
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
@@ -897,6 +967,10 @@ defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+defm AReg_288 : ARegClass<9, [v9i32, v9f32], (add AGPR_288)>;
+defm AReg_320 : ARegClass<10, [v10i32, v10f32], (add AGPR_320)>;
+defm AReg_352 : ARegClass<11, [v11i32, v11f32], (add AGPR_352)>;
+defm AReg_384 : ARegClass<12, [v12i32, v12f32], (add AGPR_384)>;
let GlobalPriority = true in {
defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
@@ -963,6 +1037,10 @@ defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
+defm AV_288 : AVRegClass<9, VReg_288.RegTypes, (add VGPR_288), (add AGPR_288)>;
+defm AV_320 : AVRegClass<10, VReg_320.RegTypes, (add VGPR_320), (add AGPR_320)>;
+defm AV_352 : AVRegClass<11, VReg_352.RegTypes, (add VGPR_352), (add AGPR_352)>;
+defm AV_384 : AVRegClass<12, VReg_384.RegTypes, (add VGPR_384), (add AGPR_384)>;
let GlobalPriority = true in {
defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
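On the TableGen side the only real logic change is in getSubRegs<N>, which maps a tuple size to the index list [sub0 ... sub(N-1)] and now also covers sizes 9 through 12; the new SIRegisterTuples and register-class definitions then mirror the existing 256-bit entries one for one. A small sketch of that size-to-indices mapping (a hypothetical helper, not generated from the .td file):

  #include <string>
  #include <vector>

  // Mirrors getSubRegs<N>.ret: size N yields [sub0, sub1, ..., sub(N-1)].
  std::vector<std::string> getSubRegIndexNames(unsigned Size) {
    std::vector<std::string> Ret;
    Ret.reserve(Size);
    for (unsigned I = 0; I < Size; ++I)
      Ret.push_back("sub" + std::to_string(I));
    return Ret;
  }

  // getSubRegIndexNames(9) == {"sub0", ..., "sub8"}, matching the new ret9 list.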
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 0e01d93e11009..2b10be1c8b3c6 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -292,6 +292,14 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
RC = &AMDGPU::VReg_224RegClass;
} else if (Info->VAddrDwords == 8) {
RC = &AMDGPU::VReg_256RegClass;
+ } else if (Info->VAddrDwords == 9) {
+ RC = &AMDGPU::VReg_288RegClass;
+ } else if (Info->VAddrDwords == 10) {
+ RC = &AMDGPU::VReg_320RegClass;
+ } else if (Info->VAddrDwords == 11) {
+ RC = &AMDGPU::VReg_352RegClass;
+ } else if (Info->VAddrDwords == 12) {
+ RC = &AMDGPU::VReg_384RegClass;
} else {
RC = &AMDGPU::VReg_512RegClass;
NewAddrDwords = 16;
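With the new classes, shrinkMIMG can keep an address operand of 9 to 12 dwords at its exact size instead of padding it out to the 512-bit class. A rough sketch of just that selection step (assuming, as the final else branch shows, that larger counts still round up to 16 dwords):

  // N address dwords -> an N * 32-bit VGPR tuple for 9 <= N <= 12;
  // larger counts still pad out to the 512-bit class.
  unsigned getShrunkAddrDwords(unsigned VAddrDwords) {
    if (VAddrDwords >= 9 && VAddrDwords <= 12)
      return VAddrDwords;   // exact fit: VReg_288 .. VReg_384
    if (VAddrDwords > 12)
      return 16;            // rounds up to VReg_512
    return VAddrDwords;     // 1-8 dword cases handled by the earlier branches
  }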
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 8d622f124d9aa..ce2f7f0642be1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2203,6 +2203,42 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AV_256RegClassID:
case AMDGPU::AV_256_Align2RegClassID:
return 256;
+ case AMDGPU::SGPR_288RegClassID:
+ case AMDGPU::SReg_288RegClassID:
+ case AMDGPU::VReg_288RegClassID:
+ case AMDGPU::AReg_288RegClassID:
+ case AMDGPU::VReg_288_Align2RegClassID:
+ case AMDGPU::AReg_288_Align2RegClassID:
+ case AMDGPU::AV_288RegClassID:
+ case AMDGPU::AV_288_Align2RegClassID:
+ return 288;
+ case AMDGPU::SGPR_320RegClassID:
+ case AMDGPU::SReg_320RegClassID:
+ case AMDGPU::VReg_320RegClassID:
+ case AMDGPU::AReg_320RegClassID:
+ case AMDGPU::VReg_320_Align2RegClassID:
+ case AMDGPU::AReg_320_Align2RegClassID:
+ case AMDGPU::AV_320RegClassID:
+ case AMDGPU::AV_320_Align2RegClassID:
+ return 320;
+ case AMDGPU::SGPR_352RegClassID:
+ case AMDGPU::SReg_352RegClassID:
+ case AMDGPU::VReg_352RegClassID:
+ case AMDGPU::AReg_352RegClassID:
+ case AMDGPU::VReg_352_Align2RegClassID:
+ case AMDGPU::AReg_352_Align2RegClassID:
+ case AMDGPU::AV_352RegClassID:
+ case AMDGPU::AV_352_Align2RegClassID:
+ return 352;
+ case AMDGPU::SGPR_384RegClassID:
+ case AMDGPU::SReg_384RegClassID:
+ case AMDGPU::VReg_384RegClassID:
+ case AMDGPU::AReg_384RegClassID:
+ case AMDGPU::VReg_384_Align2RegClassID:
+ case AMDGPU::AReg_384_Align2RegClassID:
+ case AMDGPU::AV_384RegClassID:
+ case AMDGPU::AV_384_Align2RegClassID:
+ return 384;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 5e53c25d1207c..2bebc5ed9b53b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @add_i32() #0 {
; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef
-; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef
+; ALL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'add_i32'
@@ -27,7 +27,7 @@ define amdgpu_kernel void @add_i32() #0 {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%i32 = add i32 undef, undef
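The cost-model test updates in this and the following files fall out of the new types: a <9 x i32> or <9 x float> operation no longer gets widened to 16 lanes during legalization, because v9i32/v9f32 now land in the 288-bit register classes. One way to read the add_i32 numbers above is that the per-lane estimate of 3 stays the same while the legalized lane count drops from 16 to 9, so 16 * 3 = 48 becomes 9 * 3 = 27; several of the later unpacked cases scale by the same 9/16 ratio (192 to 108, 672 to 378), though the packed-math GFX90A lines shrink by a slightly smaller factor.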
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index 3dbcbf2cfb1c4..b57f26cdc2928 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -50,7 +50,7 @@ define i32 @add(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -79,7 +79,7 @@ define i32 @add(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -108,7 +108,7 @@ define i32 @add(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -137,7 +137,7 @@ define i32 @add(i32 %arg) {
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -230,7 +230,7 @@ define i32 @sub(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -259,7 +259,7 @@ define i32 @sub(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -288,7 +288,7 @@ define i32 @sub(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -317,7 +317,7 @@ define i32 @sub(i32 %arg) {
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
index 2d074ee9f151f..b1ff4a4a0acb1 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
@@ -50,7 +50,7 @@ define i32 @add(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -79,7 +79,7 @@ define i32 @add(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -108,7 +108,7 @@ define i32 @add(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -137,7 +137,7 @@ define i32 @add(i32 %arg) {
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -230,7 +230,7 @@ define i32 @sub(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -259,7 +259,7 @@ define i32 @sub(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -288,7 +288,7 @@ define i32 @sub(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -317,7 +317,7 @@ define i32 @sub(i32 %arg) {
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index f18d55e4ef3d1..d22d8a98b4a43 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; NOPACKEDF32-LABEL: 'fadd_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; NOPACKEDF32-SIZE-LABEL: 'fadd_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fadd float undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index ca9754d1c849b..11ce416b7fd79 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @fdiv_f32_ieee() #0 {
; ALL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'fdiv_f32_ieee'
@@ -30,7 +30,7 @@ define amdgpu_kernel void @fdiv_f32_ieee() #0 {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f32 = fdiv <4 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v5f32 = fdiv <5 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %v9f32 = fdiv <9 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fdiv float undef, undef
@@ -51,7 +51,7 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 {
; ALL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f32 = fdiv <4 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v5f32 = fdiv <5 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %v9f32 = fdiv <9 x float> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'fdiv_f32_ftzdaz'
@@ -61,7 +61,7 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fdiv float undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 9ed9e700284c6..a9f1210a598f0 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; FASTF64-LABEL: 'fma_f32'
@@ -27,7 +27,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOW-LABEL: 'fma_f32'
@@ -37,7 +37,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOWF64-SIZE-LABEL: 'fma_f32'
@@ -47,7 +47,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; FASTF64-SIZE-LABEL: 'fma_f32'
@@ -57,7 +57,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW-SIZE-LABEL: 'fma_f32'
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fma_f32() #0 {
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 29528136decd6..c8dab09e0dbf7 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; F32-LABEL: 'fmul_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; F32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
; F32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX90A-SIZE-LABEL: 'fmul_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'fmul_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fmul float undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 0142e55ce35bf..b3bf580e75e66 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; NOPACKEDF32-LABEL: 'fsub_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; NOPACKEDF32-SIZE-LABEL: 'fsub_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fsub float undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index de9ba82bc2380..1444db7248330 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @mul_i32() #0 {
; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32 = mul <4 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5i32 = mul <5 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32 = mul <8 x i32> undef, undef
-; ALL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9i32 = mul <9 x i32> undef, undef
+; ALL-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9i32 = mul <9 x i32> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'mul_i32'
@@ -23,7 +23,7 @@ define amdgpu_kernel void @mul_i32() #0 {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = mul <4 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5i32 = mul <5 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = mul <8 x i32> undef, undef
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9i32 = mul <9 x i32> undef, undef
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9i32 = mul <9 x i32> undef, undef
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%i32 = mul i32 undef, undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index b668c42e99fe2..72fdd481dbb67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -2671,6 +2671,1208 @@ entry:
ret void
}
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT: s_set_gpr_idx_on s11, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_mov_b32 m0, s11
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s7
+; GFX10-NEXT: v_mov_b32_e32 v8, s8
+; GFX10-NEXT: v_movreld_b32_e32 v0, v9
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: s_mov_b32 m0, s11
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT: v_movreld_b32_e32 v0, v9
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+ ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_v(<9 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v9
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: v_mov_b32_e32 v18, s8
+; GFX10-NEXT: v_mov_b32_e32 v10, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, s1
+; GFX10-NEXT: v_mov_b32_e32 v12, s2
+; GFX10-NEXT: v_mov_b32_e32 v13, s3
+; GFX10-NEXT: v_mov_b32_e32 v14, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v15, s5
+; GFX10-NEXT: v_mov_b32_e32 v16, s6
+; GFX10-NEXT: v_mov_b32_e32 v17, s7
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, v10
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: v_dual_mov_b32 v18, s8 :: v_dual_mov_b32 v17, s7
+; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v13, s3
+; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v15, s5
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT: v_dual_mov_b32 v1, v9 :: v_dual_cndmask_b32 v8, v18, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+ ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_s(<9 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_s:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 m0, s2
+; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v9
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+ ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_v(<9 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v10
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_v:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v10
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+ ret <9 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v10, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT: s_set_gpr_idx_on s12, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: v_mov_b32_e32 v10, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_mov_b32 m0, s12
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s7
+; GFX10-NEXT: v_mov_b32_e32 v8, s8
+; GFX10-NEXT: v_mov_b32_e32 v9, s9
+; GFX10-NEXT: v_movreld_b32_e32 v0, v10
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: s_mov_b32 m0, s12
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT: v_movreld_b32_e32 v0, v10
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+ ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_v(<10 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v19, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 8, v1
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v18, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v19, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: v_mov_b32_e32 v19, s9
+; GFX10-NEXT: v_mov_b32_e32 v10, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, s1
+; GFX10-NEXT: v_mov_b32_e32 v12, s2
+; GFX10-NEXT: v_mov_b32_e32 v13, s3
+; GFX10-NEXT: v_mov_b32_e32 v14, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v15, s5
+; GFX10-NEXT: v_mov_b32_e32 v16, s6
+; GFX10-NEXT: v_mov_b32_e32 v17, s7
+; GFX10-NEXT: v_mov_b32_e32 v18, s8
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v19, v0, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, v10
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: v_dual_mov_b32 v19, s9 :: v_dual_mov_b32 v18, s8
+; GFX11-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX11-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v11
+; GFX11-NEXT: v_dual_cndmask_b32 v9, v19, v0 :: v_dual_mov_b32 v0, v10
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+ ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_s(<10 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_s:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 m0, s2
+; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v10
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+ ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_v(<10 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v11
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_v:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v11
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+ ret <10 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v11, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s10
+; GPRIDX-NEXT: s_set_gpr_idx_on s13, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v11
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: v_mov_b32_e32 v11, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_mov_b32 m0, s13
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s7
+; GFX10-NEXT: v_mov_b32_e32 v8, s8
+; GFX10-NEXT: v_mov_b32_e32 v9, s9
+; GFX10-NEXT: v_mov_b32_e32 v10, s10
+; GFX10-NEXT: v_movreld_b32_e32 v0, v11
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: s_mov_b32 m0, s13
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s10
+; GFX11-NEXT: v_movreld_b32_e32 v0, v11
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+ ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_v(<11 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v22, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v21, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v20, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v19, s7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 9, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 10, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v1
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v21, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v22, v0, s[2:3]
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: v_mov_b32_e32 v22, s10
+; GFX10-NEXT: v_mov_b32_e32 v12, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_mov_b32_e32 v13, s1
+; GFX10-NEXT: v_mov_b32_e32 v14, s2
+; GFX10-NEXT: v_mov_b32_e32 v15, s3
+; GFX10-NEXT: v_mov_b32_e32 v16, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v17, s5
+; GFX10-NEXT: v_mov_b32_e32 v18, s6
+; GFX10-NEXT: v_mov_b32_e32 v19, s7
+; GFX10-NEXT: v_mov_b32_e32 v20, s8
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT: v_mov_b32_e32 v21, s9
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, v12
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v21, s9
+; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT: v_mov_b32_e32 v12, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v17, s5
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v19, s7
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_cndmask_b32 v10, v22, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, v12
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+ ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_s(<11 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v11
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_s:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 m0, s2
+; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v11
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+ ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_v(<11 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_v:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v12
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+ ret <11 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v12, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s11
+; GPRIDX-NEXT: s_set_gpr_idx_on s14, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: v_mov_b32_e32 v12, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_mov_b32 m0, s14
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s7
+; GFX10-NEXT: v_mov_b32_e32 v8, s8
+; GFX10-NEXT: v_mov_b32_e32 v9, s9
+; GFX10-NEXT: v_mov_b32_e32 v10, s10
+; GFX10-NEXT: v_mov_b32_e32 v11, s11
+; GFX10-NEXT: v_movreld_b32_e32 v0, v12
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s11, s13
+; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: s_mov_b32 m0, s14
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10
+; GFX11-NEXT: v_movreld_b32_e32 v0, v12
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+ ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_v(<12 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s11, s13
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v23, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v22, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v21, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v20, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v19, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 8, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 9, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 10, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 11, v1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v20, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v21, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v22, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v23, v0, s[6:7]
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v13
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: v_mov_b32_e32 v23, s11
+; GFX10-NEXT: v_mov_b32_e32 v12, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_mov_b32_e32 v13, s1
+; GFX10-NEXT: v_mov_b32_e32 v14, s2
+; GFX10-NEXT: v_mov_b32_e32 v15, s3
+; GFX10-NEXT: v_mov_b32_e32 v16, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v17, s5
+; GFX10-NEXT: v_mov_b32_e32 v18, s6
+; GFX10-NEXT: v_mov_b32_e32 v19, s7
+; GFX10-NEXT: v_mov_b32_e32 v20, s8
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT: v_mov_b32_e32 v21, s9
+; GFX10-NEXT: v_mov_b32_e32 v22, s10
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v23, v0, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, v12
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s11, s13
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10
+; GFX11-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2
+; GFX11-NEXT: v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6
+; GFX11-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v13
+; GFX11-NEXT: v_dual_cndmask_b32 v11, v23, v0 :: v_dual_mov_b32 v0, v12
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+ ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_s(<12 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_s:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 m0, s2
+; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v12
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+ ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_v(<12 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_v_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 10, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 11, v13
+; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_v:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v13
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+ ret <12 x float> %insert
+}
+
define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_s_s(<16 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v16i32_s_s_s:
; GPRIDX: ; %bb.0: ; %entry
@@ -5246,47 +6448,41 @@ entry:
define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s10
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: v_mov_b32_e32 v11, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v9, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v8, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v7, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v6, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v5, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v4, s2
; GPRIDX-NEXT: v_mov_b32_e32 v3, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 2
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s2
; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5]
; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GPRIDX-NEXT: v_readfirstlane_b32 s0, v2
; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v4
@@ -5301,22 +6497,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
;
; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: v_mov_b32_e32 v17, s15
-; GFX10-NEXT: v_mov_b32_e32 v16, s14
-; GFX10-NEXT: v_mov_b32_e32 v15, s13
-; GFX10-NEXT: v_mov_b32_e32 v14, s12
-; GFX10-NEXT: v_mov_b32_e32 v13, s11
-; GFX10-NEXT: v_mov_b32_e32 v12, s10
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: v_mov_b32_e32 v11, s9
; GFX10-NEXT: v_mov_b32_e32 v10, s8
; GFX10-NEXT: v_mov_b32_e32 v9, s7
@@ -5356,19 +6546,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
;
; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
-; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
-; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
@@ -5406,77 +6593,65 @@ entry:
define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s10
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[4:5]
; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v10, v1, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5]
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1]
; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3
; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5
; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7
; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7
-; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s7, v2
; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0
; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1
; GPRIDX-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: v_mov_b32_e32 v18, s15
-; GFX10-NEXT: v_mov_b32_e32 v17, s14
-; GFX10-NEXT: v_mov_b32_e32 v16, s13
-; GFX10-NEXT: v_mov_b32_e32 v15, s12
-; GFX10-NEXT: v_mov_b32_e32 v14, s11
-; GFX10-NEXT: v_mov_b32_e32 v13, s10
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: v_mov_b32_e32 v12, s9
; GFX10-NEXT: v_mov_b32_e32 v11, s8
; GFX10-NEXT: v_mov_b32_e32 v10, s7
@@ -5516,19 +6691,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
;
; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
-; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
-; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
@@ -5596,55 +6768,55 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; GFX10-LABEL: dyn_insertelement_v5f64_v_v_s:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v8
+; GFX10-NEXT: v_readfirstlane_b32 s9, v9
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s8, v8
-; GFX10-NEXT: v_readfirstlane_b32 s9, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 3
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 2
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_readfirstlane_b32 s6, v6
+; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -5685,14 +6857,19 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; GFX10-LABEL: dyn_insertelement_v5f64_v_v_v:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX10-NEXT: v_readfirstlane_b32 s8, v8
+; GFX10-NEXT: v_readfirstlane_b32 s9, v9
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -5702,38 +6879,33 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s8, v8
-; GFX10-NEXT: v_readfirstlane_b32 s9, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_readfirstlane_b32 s6, v6
+; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
index 355ffd1456dc3..d6a433ae00076 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
@@ -688,7 +688,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
%0:sgpr(<3 x s64>) = G_IMPLICIT_DEF
%1:sgpr(<3 x s64>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
index 440e475eedc4b..67142e720219b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
@@ -296,7 +296,7 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr3_sgpr4_sgpr5
; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8
; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY $sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8
@@ -332,7 +332,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_192 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_192 = COPY $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_384 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 463b59a4fd9f2..5a7ddddedd279 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
@@ -30,7 +30,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_flat:
; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
@@ -78,7 +78,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
@@ -89,7 +89,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ra
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_flat:
; GCN: ; %bb.0:
-; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
@@ -118,7 +118,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16
; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11
-; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:30], s[4:7]
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr11
; GFX1030-NEXT: ; implicit-def: $vgpr15
; GFX1030-NEXT: ; implicit-def: $vgpr16
@@ -182,34 +182,30 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v16, v11
-; GFX1013-NEXT: v_mov_b32_e32 v17, v12
-; GFX1013-NEXT: v_mov_b32_e32 v18, v13
-; GFX1013-NEXT: v_mov_b32_e32 v19, v14
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT: v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT: v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v13
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v14
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
-; GFX1013-NEXT: ; implicit-def: $vgpr16
-; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7]
+; GFX1013-NEXT: ; implicit-def: $vgpr11
+; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10
+; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB6_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, v20
-; GFX1013-NEXT: v_mov_b32_e32 v1, v21
-; GFX1013-NEXT: v_mov_b32_e32 v2, v22
-; GFX1013-NEXT: v_mov_b32_e32 v3, v23
+; GFX1013-NEXT: v_mov_b32_e32 v0, v15
+; GFX1013-NEXT: v_mov_b32_e32 v1, v16
+; GFX1013-NEXT: v_mov_b32_e32 v2, v17
+; GFX1013-NEXT: v_mov_b32_e32 v3, v18
; GFX1013-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr:
@@ -391,7 +387,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr12
; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
@@ -415,34 +411,30 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
;
; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v16, v12
-; GFX1013-NEXT: v_mov_b32_e32 v17, v13
-; GFX1013-NEXT: v_mov_b32_e32 v18, v14
-; GFX1013-NEXT: v_mov_b32_e32 v19, v15
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT: v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT: v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v13
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v14
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v15
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7]
-; GFX1013-NEXT: ; implicit-def: $vgpr16
-; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7]
+; GFX1013-NEXT: ; implicit-def: $vgpr12
+; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB8_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, v20
-; GFX1013-NEXT: v_mov_b32_e32 v1, v21
-; GFX1013-NEXT: v_mov_b32_e32 v2, v22
-; GFX1013-NEXT: v_mov_b32_e32 v3, v23
+; GFX1013-NEXT: v_mov_b32_e32 v0, v16
+; GFX1013-NEXT: v_mov_b32_e32 v1, v17
+; GFX1013-NEXT: v_mov_b32_e32 v2, v18
+; GFX1013-NEXT: v_mov_b32_e32 v3, v19
; GFX1013-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr:
@@ -508,7 +500,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16
; GFX1030-NEXT: ; implicit-def: $vgpr10
; GFX1030-NEXT: ; implicit-def: $vgpr14
; GFX1030-NEXT: ; implicit-def: $vgpr15
@@ -529,42 +521,38 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: v_mov_b32_e32 v16, v10
-; GFX1013-NEXT: v_mov_b32_e32 v17, v11
-; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6
-; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8
+; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8
; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX1013-NEXT: v_mov_b32_e32 v18, v12
-; GFX1013-NEXT: v_mov_b32_e32 v19, v13
-; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
-; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
-; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v11
+; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14
+; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v15
; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT: v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT: v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
+; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
+; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
+; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
+; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16
-; GFX1013-NEXT: ; implicit-def: $vgpr16
-; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16
+; GFX1013-NEXT: ; implicit-def: $vgpr10
+; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB9_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, v20
-; GFX1013-NEXT: v_mov_b32_e32 v1, v21
-; GFX1013-NEXT: v_mov_b32_e32 v2, v22
-; GFX1013-NEXT: v_mov_b32_e32 v3, v23
+; GFX1013-NEXT: v_mov_b32_e32 v0, v14
+; GFX1013-NEXT: v_mov_b32_e32 v1, v15
+; GFX1013-NEXT: v_mov_b32_e32 v2, v16
+; GFX1013-NEXT: v_mov_b32_e32 v3, v17
; GFX1013-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
@@ -631,7 +619,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -661,7 +649,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -885,7 +873,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -914,7 +902,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -1012,7 +1000,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v7, s4
; GFX1030-NEXT: v_mov_b32_e32 v8, s6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -1056,7 +1044,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v7, s0
; GFX1013-NEXT: v_mov_b32_e32 v8, s2
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
index 70e486c22187d..34f83b0ae2ecb 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -1,7 +1,7 @@
# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
# Check that %11 and %20 have been coalesced.
-# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]]
-# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG:[0-9]+]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG]]
---
name: main
@@ -17,9 +17,9 @@ registers:
- { id: 6, class: sgpr_128 }
- { id: 7, class: sgpr_512 }
- { id: 9, class: vreg_512 }
- - { id: 11, class: vreg_512 }
+ - { id: 11, class: vreg_352 }
- { id: 18, class: vgpr_32 }
- - { id: 20, class: vreg_512 }
+ - { id: 20, class: vreg_352 }
- { id: 27, class: vgpr_32 }
liveins:
- { reg: '$sgpr2_sgpr3', virtual-reg: '%0' }
@@ -61,7 +61,7 @@ body: |
%11.sub6 = COPY %1
%11.sub7 = COPY %1
%11.sub8 = COPY %1
- dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
+ dead %18 = IMAGE_SAMPLE_C_D_O_V1_V11 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
%20.sub1 = COPY %2
%20.sub2 = COPY %2
%20.sub3 = COPY %2
@@ -70,6 +70,6 @@ body: |
%20.sub6 = COPY %2
%20.sub7 = COPY %2
%20.sub8 = COPY %2
- dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
+ dead %27 = IMAGE_SAMPLE_C_D_O_V1_V11 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
...
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index d17ed9302fb5c..eb0fc7e06d177 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,14 +171,20 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s26, -1
+; SI-NEXT: s_mov_b32 s27, 0xe8f000
+; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s22, s10
; SI-NEXT: s_mov_b32 s23, s11
@@ -197,24 +203,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s90, -1
+; VI-NEXT: s_mov_b32 s91, 0xe80000
+; VI-NEXT: s_add_u32 s88, s88, s3
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s22, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s22, s10
; VI-NEXT: s_mov_b32 s23, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
index 2b8a712b28c05..2b4651487eff6 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
@@ -7,21 +7,27 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
; RRLIST-LABEL: sccClobber:
; RRLIST: ; %bb.0: ; %entry
; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; RRLIST-NEXT: v_mov_b32_e32 v2, 0
+; RRLIST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; RRLIST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; RRLIST-NEXT: s_mov_b32 s22, -1
+; RRLIST-NEXT: s_mov_b32 s23, 0xe00000
+; RRLIST-NEXT: s_add_u32 s20, s20, s3
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44
; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0
+; RRLIST-NEXT: s_addc_u32 s21, s21, 0
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
-; RRLIST-NEXT: s_min_i32 s4, s16, 0
; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; RRLIST-NEXT: s_min_i32 s4, s16, 0
; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec
; RRLIST-NEXT: s_cselect_b32 s0, s16, s17
; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3]
; RRLIST-NEXT: s_cselect_b32 s0, s4, s0
+; RRLIST-NEXT: v_mov_b32_e32 v2, 0
; RRLIST-NEXT: v_mov_b32_e32 v0, s0
; RRLIST-NEXT: global_store_dword v2, v0, s[14:15]
; RRLIST-NEXT: s_endpgm
@@ -29,21 +35,27 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
; FAST-LABEL: sccClobber:
; FAST: ; %bb.0: ; %entry
; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; FAST-NEXT: v_mov_b32_e32 v2, 0
+; FAST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; FAST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; FAST-NEXT: s_mov_b32 s22, -1
+; FAST-NEXT: s_mov_b32 s23, 0xe00000
+; FAST-NEXT: s_add_u32 s20, s20, s3
; FAST-NEXT: s_waitcnt lgkmcnt(0)
; FAST-NEXT: s_load_dword s16, s[8:9], 0x0
; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44
; FAST-NEXT: s_load_dword s17, s[10:11], 0x0
+; FAST-NEXT: s_addc_u32 s21, s21, 0
; FAST-NEXT: s_waitcnt lgkmcnt(0)
-; FAST-NEXT: s_min_i32 s4, s16, 0
; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; FAST-NEXT: s_min_i32 s4, s16, 0
; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; FAST-NEXT: s_and_b64 s[0:1], vcc, exec
; FAST-NEXT: s_cselect_b32 s0, s16, s17
; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3]
; FAST-NEXT: s_cselect_b32 s0, s4, s0
+; FAST-NEXT: v_mov_b32_e32 v2, 0
; FAST-NEXT: v_mov_b32_e32 v0, s0
; FAST-NEXT: global_store_dword v2, v0, s[14:15]
; FAST-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index ff0171095b842..8110e04c133f1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -310,7 +310,7 @@ define <4 x i64> @v4i64_func_void() #0 {
; GCN-LABEL: {{^}}v5i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx2 v[8:9], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i64> @v5i64_func_void() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e1b9bb1af72a5..446f81e45415c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -385,22 +385,19 @@ bb7: ; preds = %bb4, %bb1
; GCN: s_load_dword [[ARG:s[0-9]+]]
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
+; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: s_waitcnt
; MOVREL: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
; MOVREL: s_mov_b32 m0, -1
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; IDXMODE: s_waitcnt
; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
-; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; IDXMODE: s_set_gpr_idx_off
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
-; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
; IDXMODE: s_set_gpr_idx_off
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 54111a4e1a09d..ade5d5154d749 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -610,10 +610,16 @@ entry:
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: s_mov_b32 s19, 0xe80000
+; GCN-NEXT: s_add_u32 s16, s16, s3
; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84
; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s12, 4
; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
@@ -622,10 +628,8 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
; GCN-NEXT: s_cmp_eq_u32 s12, 0
-; GCN-NEXT: v_mov_b32_e32 v4, s8
-; GCN-NEXT: v_mov_b32_e32 v5, s9
-; GCN-NEXT: s_cselect_b32 s8, 0x3ff00000, s1
-; GCN-NEXT: s_cselect_b32 s9, 0, s0
+; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
+; GCN-NEXT: s_cselect_b32 s14, 0, s0
; GCN-NEXT: s_cmp_eq_u32 s12, 3
; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
; GCN-NEXT: s_cselect_b32 s1, 0, s6
@@ -636,23 +640,26 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
; GCN-NEXT: s_add_u32 s0, s10, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_addc_u32 s1, s11, 0
-; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v6, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s9
-; GCN-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NEXT: s_add_u32 s0, s10, 32
+; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v7, s11
-; GCN-NEXT: s_add_u32 s0, s10, 32
-; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT: v_mov_b32_e32 v5, s11
; GCN-NEXT: s_addc_u32 s1, s11, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index c47760a01547f..aaa4833482e30 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -6,7 +6,7 @@
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.
-define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -37,7 +37,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -68,7 +68,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -99,7 +99,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <
ret void
}
-define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -132,7 +132,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <
; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
-define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -167,7 +167,7 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -202,7 +202,7 @@ define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -237,7 +237,7 @@ define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -272,7 +272,7 @@ define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -307,7 +307,7 @@ define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <
ret void
}
-define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -338,7 +338,7 @@ define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -369,7 +369,7 @@ define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
@@ -394,7 +394,107 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
ret <4 x float> %tmp2
}
-define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+define <9 x float> @insertelement_to_v9f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v9f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT: v_mov_b32_e32 v2, 0xc0a00000
+; GCN-NEXT: v_mov_b32_e32 v7, 0x41880000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v4, s8
+; GCN-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v8, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp = load <9 x float>, <9 x float> addrspace(4)* undef
+ %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
+ %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
+ %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
+ ret <9 x float> %tmp3
+}
+
+define <10 x float> @insertelement_to_v10f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v10f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 2.0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v4, s8
+; GCN-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NEXT: v_mov_b32_e32 v7, s11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v8, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s13
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp = load <10 x float>, <10 x float> addrspace(4)* undef
+ %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
+ ret <10 x float> %tmp1
+}
+
+define <11 x float> @insertelement_to_v11f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v11f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v4, s8
+; GCN-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NEXT: v_mov_b32_e32 v7, s11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v8, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s13
+; GCN-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp = load <11 x float>, <11 x float> addrspace(4)* undef
+ %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
+ ret <11 x float> %tmp1
+}
+
+define <12 x float> @insertelement_to_v12f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v12f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 4.0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v4, s8
+; GCN-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NEXT: v_mov_b32_e32 v7, s11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v8, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s13
+; GCN-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp = load <12 x float>, <12 x float> addrspace(4)* undef
+ %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
+ ret <12 x float> %tmp1
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -441,7 +541,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x8
@@ -494,7 +594,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x8
@@ -555,7 +655,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -606,7 +706,249 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v9f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dword s6, s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s6
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_movreld_b32_e32 v0, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v9f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dword s6, s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: v_mov_b32_e32 v8, s6
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_movreld_b32_e32 v0, v9
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
+ store <9 x float> %vecins, <9 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v10f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s6
+; SI-NEXT: v_mov_b32_e32 v9, s7
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_movreld_b32_e32 v0, v10
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v10f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: v_mov_b32_e32 v8, s6
+; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_movreld_b32_e32 v0, v10
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
+ store <10 x float> %vecins, <10 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v11f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s16
+; SI-NEXT: v_mov_b32_e32 v9, s17
+; SI-NEXT: v_mov_b32_e32 v10, s18
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_movreld_b32_e32 v0, v11
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v11f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_movreld_b32_e32 v0, v11
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
+ store <11 x float> %vecins, <11 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v12f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s16
+; SI-NEXT: v_mov_b32_e32 v9, s17
+; SI-NEXT: v_mov_b32_e32 v10, s18
+; SI-NEXT: v_mov_b32_e32 v11, s19
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_movreld_b32_e32 v0, v12
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v12f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: v_mov_b32_e32 v11, s11
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_movreld_b32_e32 v0, v12
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
+ store <12 x float> %vecins, <12 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -677,7 +1019,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -718,7 +1060,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x8
@@ -763,7 +1105,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -816,7 +1158,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
@@ -865,7 +1207,241 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v9i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dword s6, s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s6
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: v_movreld_b32_e32 v0, 5
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v9i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dword s6, s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: v_mov_b32_e32 v8, s6
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_movreld_b32_e32 v0, 5
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <9 x i32> %a, i32 5, i32 %b
+ store <9 x i32> %vecins, <9 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v10i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s6
+; SI-NEXT: v_mov_b32_e32 v9, s7
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: v_movreld_b32_e32 v0, 5
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v10i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: v_mov_b32_e32 v8, s6
+; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_movreld_b32_e32 v0, 5
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <10 x i32> %a, i32 5, i32 %b
+ store <10 x i32> %vecins, <10 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v11i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s16
+; SI-NEXT: v_mov_b32_e32 v9, s17
+; SI-NEXT: v_mov_b32_e32 v10, s18
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: v_movreld_b32_e32 v0, 5
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v11i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_movreld_b32_e32 v0, 5
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <11 x i32> %a, i32 5, i32 %b
+ store <11 x i32> %vecins, <11 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v12i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v8, s16
+; SI-NEXT: v_mov_b32_e32 v9, s17
+; SI-NEXT: v_mov_b32_e32 v10, s18
+; SI-NEXT: v_mov_b32_e32 v11, s19
+; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: v_movreld_b32_e32 v0, 5
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v12i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: v_mov_b32_e32 v11, s11
+; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_movreld_b32_e32 v0, 5
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT: s_endpgm
+ %vecins = insertelement <12 x i32> %a, i32 5, i32 %b
+ store <12 x i32> %vecins, <12 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
@@ -934,7 +1510,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)*
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -973,7 +1549,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1023,7 +1599,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
@@ -1065,7 +1641,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; FIXME: post legalize i16 and i32 shifts aren't merged because of
; isTypeDesirableForOp in SimplifyDemandedBits
-define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
@@ -1110,7 +1686,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
@@ -1149,7 +1725,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
ret void
}
-define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
+define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1201,7 +1777,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
@@ -1410,24 +1986,24 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
-define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
; SI-LABEL: insert_split_bb:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x4
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s6, 0
-; SI-NEXT: s_cbranch_scc0 .LBB30_4
+; SI-NEXT: s_cbranch_scc0 .LBB42_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s7, s[2:3], 0x1
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB30_3
-; SI-NEXT: .LBB30_2: ; %if
+; SI-NEXT: s_cbranch_vccnz .LBB42_3
+; SI-NEXT: .LBB42_2: ; %if
; SI-NEXT: s_load_dword s7, s[2:3], 0x0
-; SI-NEXT: .LBB30_3: ; %endif
+; SI-NEXT: .LBB42_3: ; %endif
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s3, 0x100f000
@@ -1435,8 +2011,8 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
-; SI-NEXT: .LBB30_4:
-; SI-NEXT: s_branch .LBB30_2
+; SI-NEXT: .LBB42_4:
+; SI-NEXT: s_branch .LBB42_2
;
; VI-LABEL: insert_split_bb:
; VI: ; %bb.0: ; %entry
@@ -1444,14 +2020,14 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s6, 0
-; VI-NEXT: s_cbranch_scc0 .LBB30_4
+; VI-NEXT: s_cbranch_scc0 .LBB42_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dword s7, s[2:3], 0x4
-; VI-NEXT: s_cbranch_execnz .LBB30_3
-; VI-NEXT: .LBB30_2: ; %if
+; VI-NEXT: s_cbranch_execnz .LBB42_3
+; VI-NEXT: .LBB42_2: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[2:3], 0x0
-; VI-NEXT: .LBB30_3: ; %endif
+; VI-NEXT: .LBB42_3: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b32 s3, 0x1100f000
@@ -1459,8 +2035,8 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
-; VI-NEXT: .LBB30_4:
-; VI-NEXT: s_branch .LBB30_2
+; VI-NEXT: .LBB42_4:
+; VI-NEXT: s_branch .LBB42_2
entry:
%0 = insertelement <2 x i32> undef, i32 %a, i32 0
%1 = icmp eq i32 %a, 0
@@ -1483,7 +2059,7 @@ endif:
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x18
@@ -1530,7 +2106,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x8
@@ -1577,7 +2153,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x10
@@ -1638,7 +2214,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x10
@@ -1709,7 +2285,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
ret void
}
-define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
+define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x20
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
index fa5e1b6c34540..f189a0ffffd4f 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,19 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
; Make sure the expected regmask is generated for sub/superregisters.
-; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
+; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
define void @csr() #0 {
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
ret void
}
-; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @subregs_for_super() #0 {
call void asm sideeffect "", "~{v0},~{v1}"() #0
ret void
}
-; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @clobbered_reg_with_sub() #0 {
call void asm sideeffect "", "~{v[0:1]}"() #0
ret void
@@ -44,3 +45,5 @@ define void @vcc() #0 {
i8* bitcast (void ()* @vcc to i8*)]
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 09e97d485782a..c03cf9edc44be 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -2286,57 +2286,56 @@ define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s8, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: s_addc_u32 s9, s3, 0
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s12, s8, 32
+; VI-NEXT: v_mov_b32_e32 v1, s10
+; VI-NEXT: s_addc_u32 s13, s9, 0
+; VI-NEXT: v_mov_b32_e32 v3, s12
+; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s13
+; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_addc_u32 s5, s9, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i64_arg:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i64_arg:
@@ -2429,57 +2428,56 @@ define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out,
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s8, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: s_addc_u32 s9, s3, 0
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s12, s8, 32
+; VI-NEXT: v_mov_b32_e32 v1, s10
+; VI-NEXT: s_addc_u32 s13, s9, 0
+; VI-NEXT: v_mov_b32_e32 v3, s12
+; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s13
+; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_addc_u32 s5, s9, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5f64_arg:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5f64_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 018d0388f5bfb..81cb2fd5d5013 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -32,9 +32,9 @@ main_body:
}
; GCN-LABEL: {{^}}sample_d_3d:
-; GFX1010-NSA: image_sample_d v[0:3], v[7:22],
+; GFX1010-NSA: image_sample_d v[0:3], v[7:15],
; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
-; GFX11-NSA: image_sample_d v[0:3], v[7:22],
+; GFX11-NSA: image_sample_d v[0:3], v[7:15],
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 90c357b3ea997..4be3180a24344 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1568,19 +1568,19 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
; VERDE-LABEL: sample_c_d_o_2darray_V1:
; VERDE: ; %bb.0: ; %main_body
-; VERDE-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
+; VERDE-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_d_o_2darray_V1:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
+; GFX6789-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_c_d_o_2darray_V1:
; GFX10PLUS: ; %bb.0: ; %main_body
-; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
@@ -1593,7 +1593,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v9, 0
; VERDE-NEXT: v_mov_b32_e32 v10, v9
-; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
+; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
; VERDE-NEXT: s_mov_b32 s15, 0xf000
; VERDE-NEXT: s_mov_b32 s14, -1
; VERDE-NEXT: s_waitcnt vmcnt(0)
@@ -1608,7 +1608,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX6789-NEXT: v_mov_b32_e32 v12, v11
; GFX6789-NEXT: v_mov_b32_e32 v9, v11
; GFX6789-NEXT: v_mov_b32_e32 v10, v12
-; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
+; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
; GFX6789-NEXT: global_store_dword v11, v10, s[12:13]
@@ -1621,7 +1621,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX10-NEXT: v_mov_b32_e32 v12, v11
; GFX10-NEXT: v_mov_b32_e32 v9, v11
; GFX10-NEXT: v_mov_b32_e32 v10, v12
-; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v9
; GFX10-NEXT: global_store_dword v11, v10, s[12:13]
@@ -1633,7 +1633,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
-; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v9
; GFX11-NEXT: global_store_b32 v11, v10, s[12:13]
@@ -1650,19 +1650,19 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
; VERDE-LABEL: sample_c_d_o_2darray_V2:
; VERDE: ; %bb.0: ; %main_body
-; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
+; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_d_o_2darray_V2:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
+; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2:
; GFX10PLUS: ; %bb.0: ; %main_body
-; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
@@ -1676,7 +1676,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; VERDE-NEXT: v_mov_b32_e32 v9, 0
; VERDE-NEXT: v_mov_b32_e32 v10, v9
; VERDE-NEXT: v_mov_b32_e32 v11, v9
-; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
+; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: v_mov_b32_e32 v0, v9
; VERDE-NEXT: v_mov_b32_e32 v1, v10
@@ -1688,7 +1688,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX6789-NEXT: v_mov_b32_e32 v9, 0
; GFX6789-NEXT: v_mov_b32_e32 v10, v9
; GFX6789-NEXT: v_mov_b32_e32 v11, v9
-; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
+; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
; GFX6789-NEXT: v_mov_b32_e32 v1, v10
@@ -1700,7 +1700,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, v9
; GFX10-NEXT: v_mov_b32_e32 v11, v9
-; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v9
; GFX10-NEXT: v_mov_b32_e32 v1, v10
@@ -1712,7 +1712,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: v_mov_b32_e32 v9, 0
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
-; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
index e813fcd8f5299..e4887b8b0f6c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
@@ -186,7 +186,7 @@ main_body:
}
; GCN-LABEL: {{^}}sample_c_d_cl_o_2d:
-; GCN: image_sample_c_d_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
+; GCN: image_sample_c_d_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_c_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -250,7 +250,7 @@ main_body:
}
; GCN-LABEL: {{^}}sample_c_cd_cl_o_2d:
-; GCN: image_sample_c_cd_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
+; GCN: image_sample_c_cd_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_c_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 5c314044d5ff5..10b77e967a31d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -20,7 +20,7 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -90,7 +90,7 @@ main_body:
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -128,7 +128,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
; GFX10-NEXT: v_mov_b32_e32 v8, s8
; GFX10-NEXT: s_mov_b32 s15, s13
; GFX10-NEXT: s_mov_b32 s13, s11
-; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[12:15] a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -182,7 +182,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -208,7 +208,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -370,7 +370,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -396,7 +396,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -461,7 +461,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -484,7 +484,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index fd5c94868fe95..dc246b42376a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -9,7 +9,7 @@
; GCN: s_load_dword s{{[0-9]+}}
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load i32, i32 addrspace(4)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
; GCN: s_load_dwordx2
; EG: VTX_READ_64
-define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -31,7 +31,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(4)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -54,13 +54,69 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}constant_load_v9i32:
+; GCN: s_load_dword
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_32
+define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+ %ld = load <9 x i32>, <9 x i32> addrspace(4)* %in
+ store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v10i32:
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+ %ld = load <10 x i32>, <10 x i32> addrspace(4)* %in
+ store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v11i32:
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+ %ld = load <11 x i32>, <11 x i32> addrspace(4)* %in
+ store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v12i32:
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+ %ld = load <12 x i32>, <12 x i32> addrspace(4)* %in
+ store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}constant_load_v16i32:
; GCN: s_load_dwordx16
@@ -68,7 +124,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -83,7 +139,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG: CF_END
; EG: VTX_READ_32
-define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load i32, i32 addrspace(4)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -100,7 +156,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out,
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
-define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load i32, i32 addrspace(4)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -110,7 +166,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out,
; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
; GCN: s_load_dword
; GCN: store_dwordx2
-define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -121,7 +177,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(
; GCN: s_load_dword s[[LO:[0-9]+]]
; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
; GCN: store_dwordx2
-define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -131,7 +187,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(
; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -145,7 +201,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(
; GCN-DAG: s_ashr_i32
; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -157,7 +213,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(
; GCN: store_dwordx4
; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -174,7 +230,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(
; GCN: store_dwordx4
; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -193,7 +249,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-SA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -221,7 +277,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -242,7 +298,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(
; GCN: store_dwordx4
; GCN: store_dwordx4
; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -269,7 +325,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspa
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -321,7 +377,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -372,7 +428,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -424,7 +480,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index e0cad5af38dd2..dd516dcdd8a63 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
; GCN-HSA: flat_load_dword
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load float, float addrspace(1)* %in
store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_64
-define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
; GCNX3-HSA: flat_load_dwordx3
; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -62,13 +62,89 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}global_load_v9f32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dword
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_32
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in
+ store <9 x float> %tmp0, <9 x float> addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}global_load_v10f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx2
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in
+ store <10 x float> %tmp0, <10 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v11f32:
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in
+ store <11 x float> %tmp0, <11 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v12f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in
+ store <12 x float> %tmp0, <12 x float> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}global_load_v16f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +160,7 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index b47cbcb6e00e4..e5f25723cb62e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -10,7 +10,7 @@
; GCN-HSA: {{flat|global}}_load_dword
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load i32, i32 addrspace(1)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx2
; EG: VTX_READ_64
-define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
; GCNX3-HSA: {{flat|global}}_load_dwordx3
; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx4
; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -62,13 +62,73 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}global_load_v9i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dword
+define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %ld = load <9 x i32>, <9 x i32> addrspace(1)* %in
+ store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v10i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx2
+define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %ld = load <10 x i32>, <10 x i32> addrspace(1)* %in
+ store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v11i32:
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx3
+define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %ld = load <11 x i32>, <11 x i32> addrspace(1)* %in
+ store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}global_load_v12i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+ %ld = load <12 x i32>, <12 x i32> addrspace(1)* %in
+ store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}global_load_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +144,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -100,7 +160,7 @@ entry:
; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -119,7 +179,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
-define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -132,7 +192,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i3
; GCN-HSA: {{flat|global}}_load_dword
; GCN-HSA: {{flat|global}}_store_dwordx2
-define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -145,7 +205,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -158,7 +218,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
; GCN-HSA: {{flat|global}}_load_dwordx2
; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -174,7 +234,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -189,7 +249,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -210,7 +270,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -233,7 +293,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -265,7 +325,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -311,7 +371,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
; GCN-DAG: v_ashrrev_i32
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -346,7 +406,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -446,7 +506,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -513,7 +573,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -581,7 +641,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
index 3c13cda7efca7..67861a395cd2c 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
@@ -714,7 +714,7 @@ body: |
# GFX11-LABEL: name: image_sample_c_d_cl_o_merged_v1v3
-# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V16_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V9_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
@@ -726,9 +726,9 @@ body: |
%2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
%3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0
%4:vgpr_32 = COPY %2.sub3
- %5:vreg_512 = IMPLICIT_DEF
- %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
- %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+ %5:vreg_288 = IMPLICIT_DEF
+ %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+ %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index c815945d03d63..033afeb3adcac 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -5,14 +5,19 @@
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s26, -1
+; SI-NEXT: s_mov_b32 s27, 0xe8f000
+; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
-; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
+; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
@@ -34,6 +39,7 @@ define amdgpu_kernel void @select_f16(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
@@ -46,14 +52,19 @@ define amdgpu_kernel void @select_f16(
;
; VI-LABEL: select_f16:
; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s26, -1
+; VI-NEXT: s_mov_b32 s27, 0xe80000
+; VI-NEXT: s_add_u32 s24, s24, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
-; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
+; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
@@ -75,6 +86,7 @@ define amdgpu_kernel void @select_f16(
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_addc_u32 s25, s25, 0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -420,14 +432,19 @@ entry:
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s26, -1
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT: s_mov_b32 s27, 0xe8f000
+; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
+; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
@@ -445,6 +462,7 @@ define amdgpu_kernel void @select_v2f16(
; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -474,8 +492,13 @@ define amdgpu_kernel void @select_v2f16(
;
; VI-LABEL: select_v2f16:
; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
+; VI-NEXT: s_mov_b32 s26, -1
+; VI-NEXT: s_mov_b32 s27, 0xe80000
+; VI-NEXT: s_add_u32 s24, s24, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s14, s2
@@ -499,6 +522,7 @@ define amdgpu_kernel void @select_v2f16(
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_addc_u32 s25, s25, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index b49be21442694..f89db580af945 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -67,14 +67,19 @@ entry:
define amdgpu_kernel void @madak_f16_use_2(
; SI-LABEL: madak_f16_use_2:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s22, -1
+; SI-NEXT: s_mov_b32 s23, 0xe8f000
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT: s_add_u32 s20, s20, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s8
; SI-NEXT: s_mov_b32 s17, s9
+; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
@@ -91,6 +96,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_addc_u32 s21, s21, 0
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -106,14 +112,19 @@ define amdgpu_kernel void @madak_f16_use_2(
;
; VI-LABEL: madak_f16_use_2:
; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s22, -1
+; VI-NEXT: s_mov_b32 s23, 0xe80000
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
+; VI-NEXT: s_add_u32 s20, s20, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
@@ -130,6 +141,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_addc_u32 s21, s21, 0
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
index 506d9cd123a95..e75a5c7aabb52 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
@@ -8,11 +8,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-inorder
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
---
@@ -22,11 +22,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-vs-vmem
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
@@ -37,11 +37,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-vs-mimg-samp
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
; GCN-NEXT: S_ENDPGM 0
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
S_ENDPGM 0
...
@@ -54,10 +54,10 @@ body: |
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 16240
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
---
@@ -69,9 +69,9 @@ body: |
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
; GCN-NEXT: S_WAITCNT 16240
- ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
- $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
diff --git a/llvm/test/MC/AMDGPU/gfx1013.s b/llvm/test/MC/AMDGPU/gfx1013.s
index 3b1f634b0d40e..b99265feaad9d 100644
--- a/llvm/test/MC/AMDGPU/gfx1013.s
+++ b/llvm/test/MC/AMDGPU/gfx1013.s
@@ -1,28 +1,28 @@
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck %s
-image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11]
// CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x02,0x00]
-image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] a16
-// CHECK: [0x01,0x9f,0x9c,0xf1,0xf0,0x05,0x02,0x40]
+image_bvh64_intersect_ray v[5:8], v[247:255], s[8:11] a16
+// CHECK: [0x01,0x9f,0x9c,0xf1,0xf7,0x05,0x02,0x40]
-image_bvh64_intersect_ray v[5:8], v[1:16], ttmp[12:15]
+image_bvh64_intersect_ray v[5:8], v[1:12], ttmp[12:15]
// CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x1e,0x00]
image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15]
-// CHECK: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
+// CHECK: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
-image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
+image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
// CHECK: [0x01,0x9f,0x98,0xf1,0x01,0xfc,0x02,0x00]
image_bvh_intersect_ray v[5:8], v[248:255], s[8:11] a16
// CHECK: [0x01,0x9f,0x98,0xf1,0xf8,0x05,0x02,0x40]
-image_bvh_intersect_ray v[5:8], v[1:16], ttmp[12:15]
+image_bvh_intersect_ray v[5:8], v[1:11], ttmp[12:15]
// CHECK: [0x01,0x9f,0x98,0xf1,0x01,0x05,0x1e,0x00]
image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
-// CHECK: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
+// CHECK: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
image_msaa_load v[5:6], v[1:4], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
// CHECK: [0x39,0x03,0x00,0xf0,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s
index c6eb05ba02ae0..bb99e6fc3eda5 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_new.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_new.s
@@ -84,16 +84,16 @@ v_fmac_legacy_f32 v0, |v1|, -v2
v_fmac_legacy_f32 v0, s1, 2.0
// GFX10: encoding: [0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00]
-image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00]
image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40]
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00]
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40]
image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
index 5216bc4524a67..45ecf93a1d306 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
@@ -298,8 +298,8 @@ image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG
image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x14,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x00,0x14,0x15]
-image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
-; GFX10: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
+image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
+; GFX10: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE
; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x1c,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x05,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
index f44b864a6da85..b36722e0f8e43 100644
--- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -761,10 +761,10 @@ global_store_d16_hi_b8 v1, v2, s[104:105]
global_store_dword_addtid v1, off offset:16 glc slc dlc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16
+image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16
// GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
+image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
// GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
index 7d11e8b6e27b6..1f1cdd70e2dfc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
@@ -1248,23 +1248,23 @@ image_atomic_xor v[1:2], v[2:3], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA uno
image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe
// GFX11: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00]
-image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11]
// GFX11: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
-image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11]
-// GFX11: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
+image_bvh64_intersect_ray v[5:8], v[244:255], s[8:11]
+// GFX11: [0x80,0x8f,0x68,0xf0,0xf4,0x05,0x02,0x00]
-image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16
+image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16
// GFX11: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
-image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16
-// GFX11: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
+image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16
+// GFX11: [0x80,0x8f,0x69,0xf0,0xf7,0xfc,0x1e,0x00]
-image_bvh_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh_intersect_ray v[5:8], v[1:11], s[8:11]
// GFX11: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
-image_bvh_intersect_ray v[5:8], v[240:255], s[8:11]
-// GFX11: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
+image_bvh_intersect_ray v[5:8], v[245:255], s[8:11]
+// GFX11: [0x80,0x8f,0x64,0xf0,0xf5,0x05,0x02,0x00]
image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16
// GFX11: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00]
@@ -3264,17 +3264,17 @@ image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c]
-image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
-image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x85,0xf0,0xf1,0x05,0x02,0x0c]
image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
// GFX11: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
@@ -3336,17 +3336,17 @@ image_sample_c_d_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c]
-image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x10,0xf1,0xf1,0x05,0x02,0x0c]
-image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x11,0xf1,0xf1,0x05,0x02,0x0c]
image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
// GFX11: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
@@ -3360,11 +3360,11 @@ image_sample_c_d_cl v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
// GFX11: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c]
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
// GFX11: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0x12,0xf1,0xf1,0xff,0x02,0x0c]
image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
// GFX11: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c]
@@ -3384,11 +3384,11 @@ image_sample_c_d_cl v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_
image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
// GFX11: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c]
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
// GFX11: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0x12,0xf1,0xf1,0xff,0x02,0x0c]
image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
// GFX11: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64]
@@ -3408,11 +3408,11 @@ image_sample_c_d_cl_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_
image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c]
-image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x50,0xf1,0xf1,0x05,0x02,0x0c]
image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c]
@@ -3480,23 +3480,23 @@ image_sample_c_d_cl_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IM
image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
+image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
// GFX11: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
-// GFX11: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
+image_sample_c_d_cl_o v[254:255], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x03,0x28,0xf1,0xf1,0xfe,0x02,0x0c]
image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
// GFX11: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c]
@@ -3504,10 +3504,10 @@ image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IM
image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
// GFX11: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c]
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
// GFX11: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
// GFX11: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
@@ -3528,10 +3528,10 @@ image_sample_c_d_cl_o v[5:6], v[1:6], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IM
image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
// GFX11: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c]
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
// GFX11: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
// GFX11: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
@@ -3552,10 +3552,10 @@ image_sample_c_d_cl_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSR
image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c]
-image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
@@ -3696,16 +3696,16 @@ image_sample_c_d_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1
image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c]
-image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0xac,0xf0,0xf1,0x05,0x02,0x0c]
-image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
@@ -3720,11 +3720,11 @@ image_sample_c_d_o v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2
image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
// GFX11: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c]
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
// GFX11: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0xae,0xf0,0xf1,0xff,0x02,0x0c]
image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
// GFX11: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c]
@@ -3744,11 +3744,11 @@ image_sample_c_d_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1
image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
// GFX11: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c]
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
// GFX11: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0xae,0xf0,0xf1,0xff,0x02,0x0c]
image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
// GFX11: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64]
@@ -3768,11 +3768,11 @@ image_sample_c_d_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_I
image_sample_c_d_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c]
-image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0xf0,0xf0,0xf1,0x05,0x02,0x0c]
image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c]
@@ -4344,11 +4344,11 @@ image_sample_d v[5:6], v[1:3], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a1
image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c]
-image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x70,0xf0,0xf1,0x05,0x02,0x0c]
image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c]
@@ -4416,10 +4416,10 @@ image_sample_d_cl v[5:6], v[1:3], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c]
-image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
@@ -4560,17 +4560,17 @@ image_sample_d_cl_o v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
image_sample_d_cl_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c]
-image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x1c,0xf1,0xf1,0x05,0x02,0x0c]
-image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x1d,0xf1,0xf1,0x05,0x02,0x0c]
image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
// GFX11: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
@@ -4584,11 +4584,11 @@ image_sample_d_cl_o v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
// GFX11: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c]
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
// GFX11: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0x1e,0xf1,0xf1,0xff,0x02,0x0c]
image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
// GFX11: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c]
@@ -4608,11 +4608,11 @@ image_sample_d_cl_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_
image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
// GFX11: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c]
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
// GFX11: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0x1e,0xf1,0xf1,0xff,0x02,0x0c]
image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
// GFX11: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64]
@@ -4632,11 +4632,11 @@ image_sample_d_cl_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_
image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c]
-image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
-image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x54,0xf1,0xf1,0x05,0x02,0x0c]
image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c]
@@ -4776,17 +4776,17 @@ image_sample_d_o v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
// GFX11: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c]
-image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
// GFX11: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
-image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
// GFX11: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
-image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_d_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x99,0xf0,0xf1,0x05,0x02,0x0c]
image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
// GFX11: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
index 3c43bb0473711..4c65dc348a578 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
@@ -163,8 +163,8 @@ image_sample_d v[64:66], [v32, v16, v8], s[4:11], s[100:103] dmask:0x7 dim:SQ_RS
image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D
// GFX11: image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
-image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
-// GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
+image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
+// GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY
// GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00]
@@ -286,17 +286,17 @@ image_msaa_load v[1:2], v[5:8], s[8:15] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
// GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00]
-image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
+image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
+// GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
// GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00]
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
+image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
+// GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
-// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
+image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
+// GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15]
// GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f]
diff --git a/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
index 632a4fce4cd27..250c00a6321b0 100644
--- a/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
@@ -1848,7 +1848,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1947,7 +1947,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2619,7 +2619,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2712,7 +2712,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3381,7 +3381,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3474,7 +3474,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4137,7 +4137,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6069,7 +6069,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6168,7 +6168,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6264,7 +6264,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6357,7 +6357,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6453,7 +6453,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6546,7 +6546,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6639,7 +6639,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6732,7 +6732,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
diff --git a/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
index e8f1f6ee04ac8..abcd4b700e583 100644
--- a/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
@@ -1773,7 +1773,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1875,7 +1875,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2568,7 +2568,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2664,7 +2664,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3354,7 +3354,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3450,7 +3450,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4134,7 +4134,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6156,7 +6156,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6258,7 +6258,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6357,7 +6357,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6453,7 +6453,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6552,7 +6552,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6648,7 +6648,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6744,7 +6744,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6840,7 +6840,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
index 21e1f3dce964a..ed65976380a53 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
@@ -1851,7 +1851,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1956,7 +1956,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2667,7 +2667,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2766,7 +2766,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3477,7 +3477,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3576,7 +3576,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4278,7 +4278,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4377,7 +4377,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6399,7 +6399,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6504,7 +6504,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6606,7 +6606,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6705,7 +6705,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6807,7 +6807,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6906,7 +6906,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -7005,7 +7005,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -7104,7 +7104,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
// CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
// CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
index 1162710aa6e6b..cdd7eeabf2fb5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
@@ -75,16 +75,16 @@
# GFX10: v_fmac_legacy_f32_e64 v0, s1, 2.0
0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00
-# GFX10: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+# GFX10: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00
# GFX10: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40
-# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00
-# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40
# GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
index 6778c10078924..8461a5ebfceb8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
@@ -1248,22 +1248,22 @@
# GFX11: image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe ; encoding: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00]
0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00
-# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00
-# GFX11: image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[240:251], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00
-# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00
-# GFX11: image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
+# GFX11: image_bvh64_intersect_ray v[252:255], v[240:248], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00
-# GFX11: image_bvh_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
+# GFX11: image_bvh_intersect_ray v[5:8], v[1:11], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00
-# GFX11: image_bvh_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
+# GFX11: image_bvh_intersect_ray v[5:8], v[240:250], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00
# GFX11: image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00]
@@ -3264,16 +3264,16 @@
# GFX11: image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
@@ -3336,16 +3336,16 @@
# GFX11: image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
@@ -3360,10 +3360,10 @@
# GFX11: image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c]
0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c
-# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c]
@@ -3384,10 +3384,10 @@
# GFX11: image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c]
0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c
-# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64]
@@ -3408,10 +3408,10 @@
# GFX11: image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c]
@@ -3480,22 +3480,22 @@
# GFX11: image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c]
0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[254:255], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c
# GFX11: image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c]
@@ -3504,10 +3504,10 @@
# GFX11: image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c]
0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c
-# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x2b,0xf1,0x01,0x05,0x22,0x0c]
@@ -3528,10 +3528,10 @@
# GFX11: image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c]
0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c
-# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x2b,0xf1,0x01,0x05,0x38,0x64]
@@ -3552,10 +3552,10 @@
# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c]
0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x59,0xf1,0x01,0x05,0x02,0x0c]
@@ -3696,16 +3696,16 @@
# GFX11: image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c]
0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
@@ -3720,10 +3720,10 @@
# GFX11: image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c]
0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c
-# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c]
@@ -3744,10 +3744,10 @@
# GFX11: image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c]
0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c
-# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64]
@@ -3768,10 +3768,10 @@
# GFX11: image_sample_c_d_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c]
0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c]
@@ -4344,10 +4344,10 @@
# GFX11: image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c]
0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c
-# GFX11: image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c]
@@ -4416,10 +4416,10 @@
# GFX11: image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c]
0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x05,0xf1,0x01,0x05,0x02,0x0c]
@@ -4560,16 +4560,16 @@
# GFX11: image_sample_d_cl_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
@@ -4584,10 +4584,10 @@
# GFX11: image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c]
0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c
-# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c]
@@ -4608,10 +4608,10 @@
# GFX11: image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c]
0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c
-# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c
# GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64]
@@ -4632,10 +4632,10 @@
# GFX11: image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c]
@@ -4776,16 +4776,16 @@
# GFX11: image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c]
0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c
-# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c
-# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c
-# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c
# GFX11: image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
index cf05f42a0ef51..240b8a37b273d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
@@ -162,7 +162,7 @@
# GFX11: image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64
-# GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
+# GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64
# GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00]
@@ -282,16 +282,16 @@
# GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00]
0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00
-# GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00
# GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00]
0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00
-# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00
-# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00
# GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f]
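
Note on the register tuples in the updated checks above: the new expectations use ranges such as v[1:9], v[1:10] and v[1:11], i.e. tuples of 9, 10 and 11 VGPRs that exactly cover the address operands, where the old output rounded up to a 16-register range like v[1:16]. As a purely hypothetical sketch (not taken from this patch), a value whose size matches one of these 9-register (288-bit) tuples can be written in LLVM IR as:

  ; hypothetical example: <9 x float> is 9 x 32 bits = 288 bits,
  ; the same width as a v[N:N+8] tuple printed in the checks above
  define <9 x float> @ret_v9f32(<9 x float> %v) {
    ret <9 x float> %v
  }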