[llvm] 595a088 - [AMDGPU] Add support for new LLVM vector types

Mateja Marjanovic via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 29 08:02:51 PST 2022


Author: Mateja Marjanovic
Date: 2022-11-29T17:02:04+01:00
New Revision: 595a08847a4b6e8d52c40715e2fa03e3d7f73189

URL: https://github.com/llvm/llvm-project/commit/595a08847a4b6e8d52c40715e2fa03e3d7f73189
DIFF: https://github.com/llvm/llvm-project/commit/595a08847a4b6e8d52c40715e2fa03e3d7f73189.diff

LOG: [AMDGPU] Add support for new LLVM vector types

Add VReg, AReg and SReg register classes on AMDGPU for bit widths 288, 320, 352 and 384.

Differential Revision: https://reviews.llvm.org/D138205
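
For illustration only (a minimal sketch, not taken from the patch or its tests):
with the new classes, 9- to 12-element vectors of 32-bit elements get their own
register tuples, e.g. a <9 x i32> value maps onto a 288-bit tuple. The kernel
name copy_v9i32, the global-memory (addrspace(1)) pointers and the alignment
below are arbitrary choices for this example.

  ; Hypothetical example: a <9 x i32> load/store now selects a 288-bit tuple.
  define amdgpu_kernel void @copy_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
    %v = load <9 x i32>, ptr addrspace(1) %in, align 4
    store <9 x i32> %v, ptr addrspace(1) %out, align 4
    ret void
  }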

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
    llvm/lib/Target/AMDGPU/MIMGInstructions.td
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
    llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
    llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
    llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
    llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
    llvm/test/Analysis/CostModel/AMDGPU/fma.ll
    llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
    llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
    llvm/test/Analysis/CostModel/AMDGPU/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
    llvm/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
    llvm/test/CodeGen/AMDGPU/load-global-f32.ll
    llvm/test/CodeGen/AMDGPU/load-global-i32.ll
    llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
    llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
    llvm/test/MC/AMDGPU/gfx1013.s
    llvm/test/MC/AMDGPU/gfx1030_new.s
    llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
    llvm/test/MC/AMDGPU/gfx10_unsupported.s
    llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
    llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
    llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
    llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
    llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
    llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index bfeee37feb4bd..d6a94c972340e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -34,12 +34,24 @@ enum PartialMappingIdx {
   PM_SGPR96 = 23,
   PM_VGPR96 = 24,
   PM_AGPR96 = 25,
-  PM_AGPR32 = 31,
-  PM_AGPR64 = 32,
-  PM_AGPR128 = 33,
-  PM_AGPR256 = 34,
-  PM_AGPR512 = 35,
-  PM_AGPR1024 = 36
+  PM_SGPR288 = 26,
+  PM_VGPR288 = 27,
+  PM_AGPR288 = 28,
+  PM_SGPR320 = 29,
+  PM_VGPR320 = 30,
+  PM_AGPR320 = 31,
+  PM_SGPR352 = 32,
+  PM_VGPR352 = 33,
+  PM_AGPR352 = 34,
+  PM_SGPR384 = 35,
+  PM_VGPR384 = 36,
+  PM_AGPR384 = 37,
+  PM_AGPR32 = 38,
+  PM_AGPR64 = 39,
+  PM_AGPR128 = 40,
+  PM_AGPR256 = 41,
+  PM_AGPR512 = 42,
+  PM_AGPR1024 = 43
 };
 
 const RegisterBankInfo::PartialMapping PartMappings[] {
@@ -66,6 +78,18 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
   {0, 96, SGPRRegBank},
   {0, 96, VGPRRegBank},
   {0, 96, AGPRRegBank},
+  {0, 288, SGPRRegBank},
+  {0, 288, VGPRRegBank},
+  {0, 288, AGPRRegBank},
+  {0, 320, SGPRRegBank},
+  {0, 320, VGPRRegBank},
+  {0, 320, AGPRRegBank},
+  {0, 352, SGPRRegBank},
+  {0, 352, VGPRRegBank},
+  {0, 352, AGPRRegBank},
+  {0, 384, SGPRRegBank},
+  {0, 384, VGPRRegBank},
+  {0, 384, AGPRRegBank},
 
   {0, 32, AGPRRegBank}, // AGPR begin
   {0, 64, AGPRRegBank},
@@ -107,6 +131,18 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
   {&PartMappings[17], 1},
   {&PartMappings[18], 1},
   {&PartMappings[19], 1},
+  {&PartMappings[20], 1},
+  {&PartMappings[21], 1},
+  {&PartMappings[22], 1},
+  {&PartMappings[23], 1},
+  {&PartMappings[24], 1},
+  {&PartMappings[25], 1},
+  {&PartMappings[26], 1},
+  {&PartMappings[27], 1},
+  {&PartMappings[28], 1},
+  {&PartMappings[29], 1},
+  {&PartMappings[30], 1},
+  {&PartMappings[31], 1},
 
   // AGPRs
   {nullptr, 0},
@@ -114,12 +150,12 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
   {nullptr, 0},
   {nullptr, 0},
   {nullptr, 0},
-  {&PartMappings[20], 1}, // 32
-  {&PartMappings[21], 1}, // 64
-  {&PartMappings[22], 1}, // 128
-  {&PartMappings[23], 1}, // 256
-  {&PartMappings[24], 1}, // 512
-  {&PartMappings[25], 1}  // 1024
+  {&PartMappings[32], 1}, // 32
+  {&PartMappings[33], 1}, // 64
+  {&PartMappings[34], 1}, // 128
+  {&PartMappings[35], 1}, // 256
+  {&PartMappings[36], 1}, // 512
+  {&PartMappings[37], 1}  // 1024
 };
 
 const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
@@ -148,7 +184,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
 enum ValueMappingIdx {
   SGPRStartIdx = 1,
   VGPRStartIdx = 12,
-  AGPRStartIdx = 26
+  AGPRStartIdx = 38
 };
 
 const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
@@ -175,6 +211,62 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
       default: llvm_unreachable("Invalid register bank");
     }
     break;
+  case 288:
+    switch (BankID) {
+      case AMDGPU::VGPRRegBankID:
+        Idx = PM_VGPR288;
+        break;
+      case AMDGPU::SGPRRegBankID:
+        Idx = PM_SGPR288;
+        break;
+      case AMDGPU::AGPRRegBankID:
+        Idx = PM_AGPR288;
+        break;
+      default: llvm_unreachable("Invalid register bank");
+    }
+    break;
+  case 320:
+    switch (BankID) {
+      case AMDGPU::VGPRRegBankID:
+        Idx = PM_VGPR320;
+        break;
+      case AMDGPU::SGPRRegBankID:
+        Idx = PM_SGPR320;
+        break;
+      case AMDGPU::AGPRRegBankID:
+        Idx = PM_AGPR320;
+        break;
+      default: llvm_unreachable("Invalid register bank");
+    }
+    break;
+  case 352:
+    switch (BankID) {
+      case AMDGPU::VGPRRegBankID:
+        Idx = PM_VGPR352;
+        break;
+      case AMDGPU::SGPRRegBankID:
+        Idx = PM_SGPR352;
+        break;
+      case AMDGPU::AGPRRegBankID:
+        Idx = PM_AGPR352;
+        break;
+      default: llvm_unreachable("Invalid register bank");
+    }
+    break;
+  case 384:
+    switch (BankID) {
+      case AMDGPU::VGPRRegBankID:
+        Idx = PM_VGPR384;
+        break;
+      case AMDGPU::SGPRRegBankID:
+        Idx = PM_SGPR384;
+        break;
+      case AMDGPU::AGPRRegBankID:
+        Idx = PM_AGPR384;
+        break;
+      default: llvm_unreachable("Invalid register bank");
+    }
+    break;
   default:
     switch (BankID) {
       case AMDGPU::VGPRRegBankID:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fb4bd6b1a0bb6..e7fb4a76b6a29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -84,6 +84,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
 
+  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
+
+  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
+
+  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
+
+  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
+
   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
 
@@ -196,6 +208,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 
+  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
+
+  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
+
+  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
+
+  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
+
   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 
@@ -325,19 +349,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FSUB, MVT::f64, Expand);
 
   setOperationAction(ISD::CONCAT_VECTORS,
-                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
-                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
-                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+                     {MVT::v3i32,  MVT::v3f32,  MVT::v4i32,  MVT::v4f32,
+                      MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
+                      MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
+                      MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
+                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                      Custom);
   setOperationAction(
       ISD::EXTRACT_SUBVECTOR,
       {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32,
        MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32,
        MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32,
-       MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v16f16, MVT::v16i16,
-       MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
-       MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64,  MVT::v4i64,
-       MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64},
+       MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32,
+       MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
+       MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
+       MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
+       MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
+       MVT::v16f64, MVT::v16i64},
       Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -384,7 +412,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
       MVT::i64, Custom);
 
   static const MVT::SimpleValueType VectorIntTypes[] = {
-      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
+      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
+      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
 
   for (MVT VT : VectorIntTypes) {
     // Expand the following operations for the current type by default.
@@ -404,7 +433,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   }
 
   static const MVT::SimpleValueType FloatVectorTypes[] = {
-      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
+      MVT::v2f32, MVT::v3f32,  MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
+      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
 
   for (MVT VT : FloatVectorTypes) {
     setOperationAction(
@@ -440,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
   AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
 
+  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
+
+  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
+
+  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
+
+  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
+
   // There are no libcalls of any kind.
   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -1064,7 +1106,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
       // Round up vec3/vec5 argument.
       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
         assert(MemVT.getVectorNumElements() == 3 ||
-               MemVT.getVectorNumElements() == 5);
+               MemVT.getVectorNumElements() == 5 ||
+               (MemVT.getVectorNumElements() >= 9 &&
+                MemVT.getVectorNumElements() <= 12));
         MemVT = MemVT.getPow2VectorType(State.getContext());
       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
         MemVT = MemVT.getRoundIntegerType(State.getContext());

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 08a82d2f7a2d0..c148c322be151 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5446,7 +5446,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
     Opcode = AMDGPU::getMIMGOpcode(
         BaseOpcodes[Is64][IsA16],
         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
-        NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
+        NumVDataDwords, NumVAddrDwords);
   }
   assert(Opcode != -1);
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 50999a4802b39..e83e644d13f3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 def SGPRRegBank : RegisterBank<"SGPR",
-  [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
+  [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_288, SReg_320, SReg_352, SReg_384, SReg_512, SReg_1024]
 >;
 
 def VGPRRegBank : RegisterBank<"VGPR",
-  [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
+  [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
 >;
 
 // It is helpful to distinguish conditions from ordinary SGPRs.
 def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
 
 def AGPRRegBank : RegisterBank <"AGPR",
-  [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
+  [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024]
 >;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 547ca652e57f5..01cb14714ad98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -431,6 +431,46 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
           IsSGPR = false;
           IsAGPR = true;
           Width = 8;
+        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 9;
+        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 9;
+        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 9;
+        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 10;
+        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 10;
+        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 10;
+        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 11;
+        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 11;
+        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 11;
+        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 12;
+        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 12;
+        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 12;
         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
           assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                  "trap handler registers should not be used");

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3c6050cba7d56..617b5bb2d859f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2360,6 +2360,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
         return AMDGPU::VReg_224RegClassID;
       case 256:
         return AMDGPU::VReg_256RegClassID;
+      case 288:
+        return AMDGPU::VReg_288RegClassID;
+      case 320:
+        return AMDGPU::VReg_320RegClassID;
+      case 352:
+        return AMDGPU::VReg_352RegClassID;
+      case 384:
+        return AMDGPU::VReg_384RegClassID;
       case 512:
         return AMDGPU::VReg_512RegClassID;
       case 1024:
@@ -2398,6 +2406,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
         return AMDGPU::SGPR_224RegClassID;
       case 256:
         return AMDGPU::SGPR_256RegClassID;
+      case 288:
+        return AMDGPU::SGPR_288RegClassID;
+      case 320:
+        return AMDGPU::SGPR_320RegClassID;
+      case 352:
+        return AMDGPU::SGPR_352RegClassID;
+      case 384:
+        return AMDGPU::SGPR_384RegClassID;
       case 512:
         return AMDGPU::SGPR_512RegClassID;
     }
@@ -2420,6 +2436,14 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
         return AMDGPU::AReg_224RegClassID;
       case 256:
         return AMDGPU::AReg_256RegClassID;
+      case 288:
+        return AMDGPU::AReg_288RegClassID;
+      case 320:
+        return AMDGPU::AReg_320RegClassID;
+      case 352:
+        return AMDGPU::AReg_352RegClassID;
+      case 384:
+        return AMDGPU::AReg_384RegClassID;
       case 512:
         return AMDGPU::AReg_512RegClassID;
       case 1024:
@@ -3684,7 +3708,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
       AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
 
   if (!IsNSA) {
-    if (ExpectedAddrSize > 8)
+    if (ExpectedAddrSize > 12)
       ExpectedAddrSize = 16;
 
     // Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required.

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ef084cf74a975..8cffbd973317f 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -129,6 +129,9 @@ DECODE_OPERAND_REG(VReg_64)
 DECODE_OPERAND_REG(VReg_96)
 DECODE_OPERAND_REG(VReg_128)
 DECODE_OPERAND_REG(VReg_256)
+DECODE_OPERAND_REG(VReg_288)
+DECODE_OPERAND_REG(VReg_352)
+DECODE_OPERAND_REG(VReg_384)
 DECODE_OPERAND_REG(VReg_512)
 DECODE_OPERAND_REG(VReg_1024)
 
@@ -919,7 +922,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
             Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
     if (!IsNSA) {
-      if (AddrSize > 8)
+      if (AddrSize > 12)
         AddrSize = 16;
     } else {
       if (AddrSize > Info->VAddrDwords) {
@@ -1129,6 +1132,14 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
   case AMDGPU::TTMP_256RegClassID:
     // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
   // this bundle?
+  case AMDGPU::SGPR_288RegClassID:
+  case AMDGPU::TTMP_288RegClassID:
+  case AMDGPU::SGPR_320RegClassID:
+  case AMDGPU::TTMP_320RegClassID:
+  case AMDGPU::SGPR_352RegClassID:
+  case AMDGPU::TTMP_352RegClassID:
+  case AMDGPU::SGPR_384RegClassID:
+  case AMDGPU::TTMP_384RegClassID:
   case AMDGPU::SGPR_512RegClassID:
   case AMDGPU::TTMP_512RegClassID:
     shift = 2;
@@ -1204,6 +1215,23 @@ MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
   return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_288(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_288RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_320(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_320RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_352(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_352RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_384(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_384RegClassID, Val & 255);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
   return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
 }
@@ -1252,6 +1280,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const {
   return createRegOperand(AMDGPU::VReg_256RegClassID, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_288(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_288RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_320(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_320RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_352(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_352RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_384(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_384RegClassID, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
   return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
 }
@@ -1302,6 +1346,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
   return decodeDstOp(OPW256, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_288(unsigned Val) const {
+  return decodeDstOp(OPW288, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_320(unsigned Val) const {
+  return decodeDstOp(OPW320, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_352(unsigned Val) const {
+  return decodeDstOp(OPW352, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_384(unsigned Val) const {
+  return decodeDstOp(OPW384, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
   return decodeDstOp(OPW512, Val);
 }
@@ -1460,6 +1520,10 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
   case OPW128: return VReg_128RegClassID;
   case OPW160: return VReg_160RegClassID;
   case OPW256: return VReg_256RegClassID;
+  case OPW288: return VReg_288RegClassID;
+  case OPW320: return VReg_320RegClassID;
+  case OPW352: return VReg_352RegClassID;
+  case OPW384: return VReg_384RegClassID;
   case OPW512: return VReg_512RegClassID;
   case OPW1024: return VReg_1024RegClassID;
   }
@@ -1481,6 +1545,10 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
   case OPW128: return AReg_128RegClassID;
   case OPW160: return AReg_160RegClassID;
   case OPW256: return AReg_256RegClassID;
+  case OPW288: return AReg_288RegClassID;
+  case OPW320: return AReg_320RegClassID;
+  case OPW352: return AReg_352RegClassID;
+  case OPW384: return AReg_384RegClassID;
   case OPW512: return AReg_512RegClassID;
   case OPW1024: return AReg_1024RegClassID;
   }
@@ -1503,6 +1571,10 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
   case OPW128: return SGPR_128RegClassID;
   case OPW160: return SGPR_160RegClassID;
   case OPW256: return SGPR_256RegClassID;
+  case OPW288: return SGPR_288RegClassID;
+  case OPW320: return SGPR_320RegClassID;
+  case OPW352: return SGPR_352RegClassID;
+  case OPW384: return SGPR_384RegClassID;
   case OPW512: return SGPR_512RegClassID;
   }
 }
@@ -1521,6 +1593,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
   case OPWV232: return TTMP_64RegClassID;
   case OPW128: return TTMP_128RegClassID;
   case OPW256: return TTMP_256RegClassID;
+  case OPW288: return TTMP_288RegClassID;
+  case OPW320: return TTMP_320RegClassID;
+  case OPW352: return TTMP_352RegClassID;
+  case OPW384: return TTMP_384RegClassID;
   case OPW512: return TTMP_512RegClassID;
   }
 }

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d0aef9cdf79da..b811d70bc108b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -182,6 +182,10 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeOperand_VReg_96(unsigned Val) const;
   MCOperand decodeOperand_VReg_128(unsigned Val) const;
   MCOperand decodeOperand_VReg_256(unsigned Val) const;
+  MCOperand decodeOperand_VReg_288(unsigned Val) const;
+  MCOperand decodeOperand_VReg_320(unsigned Val) const;
+  MCOperand decodeOperand_VReg_352(unsigned Val) const;
+  MCOperand decodeOperand_VReg_384(unsigned Val) const;
   MCOperand decodeOperand_VReg_512(unsigned Val) const;
   MCOperand decodeOperand_VReg_1024(unsigned Val) const;
 
@@ -193,12 +197,20 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
   MCOperand decodeOperand_SReg_128(unsigned Val) const;
   MCOperand decodeOperand_SReg_256(unsigned Val) const;
+  MCOperand decodeOperand_SReg_288(unsigned Val) const;
+  MCOperand decodeOperand_SReg_320(unsigned Val) const;
+  MCOperand decodeOperand_SReg_352(unsigned Val) const;
+  MCOperand decodeOperand_SReg_384(unsigned Val) const;
   MCOperand decodeOperand_SReg_512(unsigned Val) const;
 
   MCOperand decodeOperand_AGPR_32(unsigned Val) const;
   MCOperand decodeOperand_AReg_64(unsigned Val) const;
   MCOperand decodeOperand_AReg_128(unsigned Val) const;
   MCOperand decodeOperand_AReg_256(unsigned Val) const;
+  MCOperand decodeOperand_AReg_288(unsigned Val) const;
+  MCOperand decodeOperand_AReg_320(unsigned Val) const;
+  MCOperand decodeOperand_AReg_352(unsigned Val) const;
+  MCOperand decodeOperand_AReg_384(unsigned Val) const;
   MCOperand decodeOperand_AReg_512(unsigned Val) const;
   MCOperand decodeOperand_AReg_1024(unsigned Val) const;
   MCOperand decodeOperand_AV_32(unsigned Val) const;
@@ -214,6 +226,10 @@ class AMDGPUDisassembler : public MCDisassembler {
     OPW128,
     OPW160,
     OPW256,
+    OPW288,
+    OPW320,
+    OPW352,
+    OPW384,
     OPW512,
     OPW1024,
     OPW16,

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 05588cfc45241..d3aee55c5f35c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -503,6 +503,10 @@ void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
       MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+      MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) ||
+      MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) ||
+      MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) ||
+      MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
     Enc |= 512;

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 533af659116ee..d018fd88b71a1 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -958,7 +958,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> {
                            !if(!eq(NumWords, 6), VReg_192,
                            !if(!eq(NumWords, 7), VReg_224,
                            !if(!le(NumWords, 8), VReg_256,
-                           !if(!le(NumWords, 16), VReg_512, ?))))))))));
+                           !if(!le(NumWords, 9), VReg_288,
+                           !if(!le(NumWords, 10), VReg_320,
+                           !if(!le(NumWords, 11), VReg_352,
+                           !if(!le(NumWords, 12), VReg_384,
+                           !if(!le(NumWords, 16), VReg_512, ?))))))))))))));
 
   // Whether the instruction variant with this vaddr size should be enabled for
   // the auto-generated disassembler.
@@ -1007,8 +1011,8 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
            !foreach(range,
                     // V4 is generated for V3 and V4
                     // V8 is generated for V5 through V8
-                    // V16 is generated for V9 through V16
-                    [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]],
+                    // V16 is generated for V13 through V16
+                    [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9],[10],[11],[12],[13,16]],
                     MIMGAddrSizes_dw_range<range>),
            lhs, dw,
            !if(isRangeInList<dw.Min, dw.Max, AllNumAddrWords>.ret,

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 198dee022532c..1cba1e446ed44 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -120,6 +120,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
   addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
 
+  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
+  addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
+
+  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
+  addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
+
+  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
+  addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
+
+  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
+  addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
+
   addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
   addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
 
@@ -158,15 +170,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD,
-                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
-                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
-                      MVT::v32i32},
+                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
+                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
+                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
+                      MVT::i1,     MVT::v32i32},
                      Custom);
 
   setOperationAction(ISD::STORE,
-                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
-                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
-                      MVT::v32i32},
+                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
+                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
+                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
+                      MVT::i1,     MVT::v32i32},
                      Custom);
 
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
@@ -209,12 +223,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
 
   setOperationAction(ISD::TRUNCATE,
-                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
-                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32},
+                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
+                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
+                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                      Expand);
   setOperationAction(ISD::FP_ROUND,
-                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
-                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32},
+                     {MVT::v2f32,  MVT::v3f32,  MVT::v4f32,  MVT::v5f32,
+                      MVT::v6f32,  MVT::v7f32,  MVT::v8f32,  MVT::v9f32,
+                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                      Expand);
 
   setOperationAction(ISD::SIGN_EXTEND_INREG,
@@ -240,11 +256,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
   for (MVT VT :
-       {MVT::v8i32,  MVT::v8f32,  MVT::v16i32, MVT::v16f32, MVT::v2i64,
-        MVT::v2f64,  MVT::v4i16,  MVT::v4f16,  MVT::v3i64,  MVT::v3f64,
-        MVT::v6i32,  MVT::v6f32,  MVT::v4i64,  MVT::v4f64,  MVT::v8i64,
-        MVT::v8f64,  MVT::v8i16,  MVT::v8f16,  MVT::v16i16, MVT::v16f16,
-        MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
+       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,   MVT::v9f32,  MVT::v10i32,
+        MVT::v10f32, MVT::v11i32, MVT::v11f32,  MVT::v12i32, MVT::v12f32,
+        MVT::v16i32, MVT::v16f32, MVT::v2i64,   MVT::v2f64,  MVT::v4i16,
+        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,   MVT::v6i32,  MVT::v6f32,
+        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,   MVT::v8f64,  MVT::v8i16,
+        MVT::v8f16,  MVT::v16i16, MVT::v16f16,  MVT::v16i64, MVT::v16f64,
+        MVT::v32i32, MVT::v32f32}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -365,8 +383,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // Deal with vec5/6/7 vector operations when widened to vec8.
   setOperationAction(ISD::INSERT_SUBVECTOR,
-                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
-                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+                     {MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
+                      MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
+                      MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
+                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                      Custom);
 
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
@@ -4235,6 +4255,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::SI_INDIRECT_SRC_V2:
   case AMDGPU::SI_INDIRECT_SRC_V4:
   case AMDGPU::SI_INDIRECT_SRC_V8:
+  case AMDGPU::SI_INDIRECT_SRC_V9:
+  case AMDGPU::SI_INDIRECT_SRC_V10:
+  case AMDGPU::SI_INDIRECT_SRC_V11:
+  case AMDGPU::SI_INDIRECT_SRC_V12:
   case AMDGPU::SI_INDIRECT_SRC_V16:
   case AMDGPU::SI_INDIRECT_SRC_V32:
     return emitIndirectSrc(MI, *BB, *getSubtarget());
@@ -4242,6 +4266,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::SI_INDIRECT_DST_V2:
   case AMDGPU::SI_INDIRECT_DST_V4:
   case AMDGPU::SI_INDIRECT_DST_V8:
+  case AMDGPU::SI_INDIRECT_DST_V9:
+  case AMDGPU::SI_INDIRECT_DST_V10:
+  case AMDGPU::SI_INDIRECT_DST_V11:
+  case AMDGPU::SI_INDIRECT_DST_V12:
   case AMDGPU::SI_INDIRECT_DST_V16:
   case AMDGPU::SI_INDIRECT_DST_V32:
     return emitIndirectDst(MI, *BB, *getSubtarget());
@@ -6185,7 +6213,7 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
   MVT Type;
   unsigned NumElts = Elts.size();
 
-  if (NumElts <= 8) {
+  if (NumElts <= 12) {
     Type = MVT::getVectorVT(MVT::f32, NumElts);
   } else {
     assert(Elts.size() <= 16);
@@ -7735,7 +7763,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
           AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
-                                NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
+                                NumVDataDwords, NumVAddrDwords);
     }
     assert(Opcode != -1);
 
@@ -7801,13 +7829,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 
     if (!UseNSA) {
       // Build a single vector containing all the operands so far prepared.
-      if (NumVAddrDwords > 8) {
+      if (NumVAddrDwords > 12) {
         SDValue Undef = DAG.getUNDEF(MVT::i32);
         Ops.append(16 - Ops.size(), Undef);
       }
-      assert(Ops.size() == 8 || Ops.size() == 16);
+      assert(Ops.size() >= 8 && Ops.size() <= 12);
       SDValue MergedOps = DAG.getBuildVector(
-          Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+          MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
       Ops.clear();
       Ops.push_back(MergedOps);
     }
@@ -12466,6 +12494,14 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
     return AMDGPU::VReg_224_Align2RegClassID;
   case AMDGPU::VReg_256RegClassID:
     return AMDGPU::VReg_256_Align2RegClassID;
+  case AMDGPU::VReg_288RegClassID:
+    return AMDGPU::VReg_288_Align2RegClassID;
+  case AMDGPU::VReg_320RegClassID:
+    return AMDGPU::VReg_320_Align2RegClassID;
+  case AMDGPU::VReg_352RegClassID:
+    return AMDGPU::VReg_352_Align2RegClassID;
+  case AMDGPU::VReg_384RegClassID:
+    return AMDGPU::VReg_384_Align2RegClassID;
   case AMDGPU::VReg_512RegClassID:
     return AMDGPU::VReg_512_Align2RegClassID;
   case AMDGPU::VReg_1024RegClassID:

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9da08ffaa08ed..6746a9174550e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1300,6 +1300,14 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
     if (VecSize <= 256) // 32 bytes
       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
+    if (VecSize <= 288) // 36 bytes
+      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
+    if (VecSize <= 320) // 40 bytes
+      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
+    if (VecSize <= 352) // 44 bytes
+      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
+    if (VecSize <= 384) // 48 bytes
+      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
     if (VecSize <= 512) // 64 bytes
       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
     if (VecSize <= 1024) // 128 bytes
@@ -1320,6 +1328,14 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
   if (VecSize <= 256) // 32 bytes
     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
+  if (VecSize <= 288) // 36 bytes
+    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
+  if (VecSize <= 320) // 40 bytes
+    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
+  if (VecSize <= 352) // 44 bytes
+    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
+  if (VecSize <= 384) // 48 bytes
+    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
   if (VecSize <= 512) // 64 bytes
     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
   if (VecSize <= 1024) // 128 bytes
@@ -1341,6 +1357,14 @@ static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
   if (VecSize <= 256) // 32 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
+  if (VecSize <= 288) // 36 bytes
+    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
+  if (VecSize <= 320) // 40 bytes
+    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
+  if (VecSize <= 352) // 44 bytes
+    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
+  if (VecSize <= 384) // 48 bytes
+    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
   if (VecSize <= 512) // 64 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
   if (VecSize <= 1024) // 128 bytes
@@ -1421,6 +1445,14 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S224_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_S256_SAVE;
+  case 36:
+    return AMDGPU::SI_SPILL_S288_SAVE;
+  case 40:
+    return AMDGPU::SI_SPILL_S320_SAVE;
+  case 44:
+    return AMDGPU::SI_SPILL_S352_SAVE;
+  case 48:
+    return AMDGPU::SI_SPILL_S384_SAVE;
   case 64:
     return AMDGPU::SI_SPILL_S512_SAVE;
   case 128:
@@ -1448,6 +1480,14 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V224_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_V256_SAVE;
+  case 36:
+    return AMDGPU::SI_SPILL_V288_SAVE;
+  case 40:
+    return AMDGPU::SI_SPILL_V320_SAVE;
+  case 44:
+    return AMDGPU::SI_SPILL_V352_SAVE;
+  case 48:
+    return AMDGPU::SI_SPILL_V384_SAVE;
   case 64:
     return AMDGPU::SI_SPILL_V512_SAVE;
   case 128:
@@ -1588,6 +1628,14 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_S256_RESTORE;
+  case 36:
+    return AMDGPU::SI_SPILL_S288_RESTORE;
+  case 40:
+    return AMDGPU::SI_SPILL_S320_RESTORE;
+  case 44:
+    return AMDGPU::SI_SPILL_S352_RESTORE;
+  case 48:
+    return AMDGPU::SI_SPILL_S384_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_S512_RESTORE;
   case 128:
@@ -1615,6 +1663,14 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_V256_RESTORE;
+  case 36:
+    return AMDGPU::SI_SPILL_V288_RESTORE;
+  case 40:
+    return AMDGPU::SI_SPILL_V320_RESTORE;
+  case 44:
+    return AMDGPU::SI_SPILL_V352_RESTORE;
+  case 48:
+    return AMDGPU::SI_SPILL_V384_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_V512_RESTORE;
   case 128:
@@ -1642,6 +1698,14 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_A224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_A256_RESTORE;
+  case 36:
+    return AMDGPU::SI_SPILL_A288_RESTORE;
+  case 40:
+    return AMDGPU::SI_SPILL_A320_RESTORE;
+  case 44:
+    return AMDGPU::SI_SPILL_A352_RESTORE;
+  case 48:
+    return AMDGPU::SI_SPILL_A384_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_A512_RESTORE;
   case 128:
@@ -1669,6 +1733,14 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_AV224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_AV256_RESTORE;
+  case 36:
+    return AMDGPU::SI_SPILL_AV288_RESTORE;
+  case 40:
+    return AMDGPU::SI_SPILL_AV320_RESTORE;
+  case 44:
+    return AMDGPU::SI_SPILL_AV352_RESTORE;
+  case 48:
+    return AMDGPU::SI_SPILL_AV384_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_AV512_RESTORE;
   case 128:
@@ -1974,6 +2046,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
+  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
+  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
+  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
+  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
@@ -2025,6 +2101,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
+  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
+  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
+  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
+  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
     assert(ST.useVGPRIndexMode());
@@ -2064,6 +2144,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
+  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
+  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
+  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
+  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
     assert(ST.useVGPRIndexMode());
@@ -4531,7 +4615,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       } else {
         const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
         VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
-        if (AddrWords > 8)
+        if (AddrWords > 12)
           AddrWords = 16;
       }
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0516547e179c3..6c9f38c250f08 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -650,6 +650,10 @@ def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
 def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
 def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
 def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
+def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
+def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
+def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
 def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
 def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
 
@@ -657,6 +661,10 @@ def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
+def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
+def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
+def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
 def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
 
@@ -698,6 +706,10 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<
 def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
 def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
 def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
 def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
 def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
 
@@ -735,6 +747,10 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VR
 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
 
@@ -751,6 +767,10 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg
 def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
 def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
 def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
 def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
 def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
 
@@ -784,6 +804,10 @@ defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
 defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
 defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
+defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
+defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
+defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
 
@@ -828,6 +852,10 @@ defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
 defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
 defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
+defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
+defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
+defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
 
@@ -839,6 +867,10 @@ defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
 defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
 defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
 defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
+defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
+defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
+defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
 defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
 defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
 
@@ -850,6 +882,10 @@ defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
 defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
 defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
 defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
+defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
+defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
+defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
+defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
 defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
 defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
 
@@ -1225,6 +1261,70 @@ foreach Index = 0-7 in {
   >;
 }
 
+foreach Index = 0-8 in {
+  def Extract_Element_v9i32_#Index : Extract_Element <
+    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v9i32_#Index : Insert_Element <
+    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v9f32_#Index : Extract_Element <
+    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v9f32_#Index : Insert_Element <
+    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-9 in {
+  def Extract_Element_v10i32_#Index : Extract_Element <
+    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v10i32_#Index : Insert_Element <
+    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v10f32_#Index : Extract_Element <
+    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v10f32_#Index : Insert_Element <
+    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-10 in {
+  def Extract_Element_v11i32_#Index : Extract_Element <
+    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v11i32_#Index : Insert_Element <
+    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v11f32_#Index : Extract_Element <
+    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v11f32_#Index : Insert_Element <
+    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-11 in {
+  def Extract_Element_v12i32_#Index : Extract_Element <
+    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v12i32_#Index : Insert_Element <
+    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v12f32_#Index : Extract_Element <
+    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v12f32_#Index : Insert_Element <
+    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
 foreach Index = 0-15 in {
   def Extract_Element_v16i32_#Index : Extract_Element <
     i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1482,6 +1582,30 @@ def : BitConvert <v4i64, v16i16, VReg_256>;
 def : BitConvert <v4f64, v16f16, VReg_256>;
 def : BitConvert <v4f64, v16i16, VReg_256>;
 
+// 288-bit bitcast
+def : BitConvert <v9i32, v9f32, SReg_288>;
+def : BitConvert <v9f32, v9i32, SReg_288>;
+def : BitConvert <v9i32, v9f32, VReg_288>;
+def : BitConvert <v9f32, v9i32, VReg_288>;
+
+// 320-bit bitcast
+def : BitConvert <v10i32, v10f32, SReg_320>;
+def : BitConvert <v10f32, v10i32, SReg_320>;
+def : BitConvert <v10i32, v10f32, VReg_320>;
+def : BitConvert <v10f32, v10i32, VReg_320>;
+
+// 352-bit bitcast
+def : BitConvert <v11i32, v11f32, SReg_352>;
+def : BitConvert <v11f32, v11i32, SReg_352>;
+def : BitConvert <v11i32, v11f32, VReg_352>;
+def : BitConvert <v11f32, v11i32, VReg_352>;
+
+// 384-bit bitcast
+def : BitConvert <v12i32, v12f32, SReg_384>;
+def : BitConvert <v12f32, v12i32, SReg_384>;
+def : BitConvert <v12i32, v12f32, VReg_384>;
+def : BitConvert <v12f32, v12i32, VReg_384>;
+
 // 512-bit bitcast
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;
@@ -2022,12 +2146,20 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
 defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
 defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
 defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">;
+defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">;
+defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">;
+defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">;
 defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
 defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
 
 defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
 defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
 defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">;
+defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">;
+defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">;
+defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">;
 defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
 defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d031c2808adcd..7384f1d083c4a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,6 +2449,14 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::VReg_224RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::VReg_256RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::VReg_288RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::VReg_320RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::VReg_352RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::VReg_384RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::VReg_512RegClass;
   if (BitWidth <= 1024)
@@ -2473,6 +2481,14 @@ getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::VReg_224_Align2RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::VReg_256_Align2RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::VReg_288_Align2RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::VReg_320_Align2RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::VReg_352_Align2RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::VReg_384_Align2RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::VReg_512_Align2RegClass;
   if (BitWidth <= 1024)
@@ -2509,6 +2525,14 @@ getAnyAGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::AReg_224RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::AReg_256RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::AReg_288RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::AReg_320RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::AReg_352RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::AReg_384RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::AReg_512RegClass;
   if (BitWidth <= 1024)
@@ -2533,6 +2557,14 @@ getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::AReg_224_Align2RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::AReg_256_Align2RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::AReg_288_Align2RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::AReg_320_Align2RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::AReg_352_Align2RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::AReg_384_Align2RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::AReg_512_Align2RegClass;
   if (BitWidth <= 1024)
@@ -2567,6 +2599,14 @@ getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::AV_224RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::AV_256RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::AV_288RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::AV_320RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::AV_352RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::AV_384RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::AV_512RegClass;
   if (BitWidth <= 1024)
@@ -2591,6 +2631,14 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::AV_224_Align2RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::AV_256_Align2RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::AV_288_Align2RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::AV_320_Align2RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::AV_352_Align2RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::AV_384_Align2RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::AV_512_Align2RegClass;
   if (BitWidth <= 1024)
@@ -2630,6 +2678,14 @@ SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::SGPR_224RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::SGPR_256RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::SGPR_288RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::SGPR_320RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::SGPR_352RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::SGPR_384RegClass;
   if (BitWidth <= 512)
     return &AMDGPU::SGPR_512RegClass;
   if (BitWidth <= 1024)
@@ -2686,6 +2742,26 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
     &AMDGPU::SReg_256RegClass,
     &AMDGPU::AReg_256_Align2RegClass,
     &AMDGPU::AReg_256RegClass,
+    &AMDGPU::VReg_288_Align2RegClass,
+    &AMDGPU::VReg_288RegClass,
+    &AMDGPU::SReg_288RegClass,
+    &AMDGPU::AReg_288_Align2RegClass,
+    &AMDGPU::AReg_288RegClass,
+    &AMDGPU::VReg_320_Align2RegClass,
+    &AMDGPU::VReg_320RegClass,
+    &AMDGPU::SReg_320RegClass,
+    &AMDGPU::AReg_320_Align2RegClass,
+    &AMDGPU::AReg_320RegClass,
+    &AMDGPU::VReg_352_Align2RegClass,
+    &AMDGPU::VReg_352RegClass,
+    &AMDGPU::SReg_352RegClass,
+    &AMDGPU::AReg_352_Align2RegClass,
+    &AMDGPU::AReg_352RegClass,
+    &AMDGPU::VReg_384_Align2RegClass,
+    &AMDGPU::VReg_384RegClass,
+    &AMDGPU::SReg_384RegClass,
+    &AMDGPU::AReg_384_Align2RegClass,
+    &AMDGPU::AReg_384RegClass,
     &AMDGPU::VReg_512_Align2RegClass,
     &AMDGPU::VReg_512RegClass,
     &AMDGPU::SReg_512RegClass,

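[Editor's note] The SIRegisterInfo.cpp hunks above all extend the same pattern: a chain of
"if (BitWidth <= N)" checks that returns the narrowest register class wide enough for the
requested value, now with 288/320/352/384-bit stops between 256 and 512. A minimal,
illustrative C++ sketch of that rounding behaviour (not the real code; the actual functions
such as getAnyVGPRClassForBitWidth return TargetRegisterClass pointers, and the width list
here is abbreviated to the sizes visible in this hunk):

  // Sketch: round a bit width up to the narrowest supported register tuple.
  unsigned roundToRegTupleWidth(unsigned BitWidth) {
    static const unsigned Widths[] = {224, 256, 288, 320, 352, 384, 512, 1024};
    for (unsigned W : Widths)
      if (BitWidth <= W)
        return W;        // narrowest tuple that can hold BitWidth bits
    return 0;            // wider than 1024 bits: unsupported
  }

For example, a v9i32 value (288 bits) now maps to the 288-bit class, where the old chain
skipped straight from 256 to 512.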
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index b1e8761fb05e9..21b7c28eed66d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -60,6 +60,16 @@ class getSubRegs<int size> {
   list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
   list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
   list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
+  list<SubRegIndex> ret9 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8];
+  list<SubRegIndex> ret10 = [sub0, sub1, sub2, sub3,
+                             sub4, sub5, sub6, sub7,
+                             sub8, sub9];
+  list<SubRegIndex> ret11 = [sub0, sub1, sub2, sub3,
+                             sub4, sub5, sub6, sub7,
+                             sub8, sub9, sub10];
+  list<SubRegIndex> ret12 = [sub0, sub1, sub2, sub3,
+                             sub4, sub5, sub6, sub7,
+                             sub8, sub9, sub10, sub11];
   list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
                              sub4, sub5, sub6, sub7,
                              sub8, sub9, sub10, sub11,
@@ -80,8 +90,12 @@ class getSubRegs<int size> {
                                           !if(!eq(size, 6), ret6,
                                               !if(!eq(size, 7), ret7,
                                                   !if(!eq(size, 8), ret8,
-                                                      !if(!eq(size, 16), ret16,
-                                                          ret32))))))));
+                                                      !if(!eq(size, 9), ret9,
+                                                          !if(!eq(size, 10), ret10,
+                                                              !if(!eq(size, 11), ret11,
+                                                                  !if(!eq(size, 12), ret12,
+                                                                      !if(!eq(size, 16), ret16,
+                                                                          ret32))))))))))));
 }
 
 // Generates list of sequential register tuple names.
@@ -423,6 +437,18 @@ def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
 // SGPR 256-bit registers
 def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
 
+// SGPR 288-bit registers. No operations use these, but they are defined for symmetry with the 288-bit VGPRs.
+def SGPR_288Regs : SIRegisterTuples<getSubRegs<9>.ret, SGPR_32, 105, 4, 9, "s">;
+
+// SGPR 320-bit registers. No operations use these, but they are defined for symmetry with the 320-bit VGPRs.
+def SGPR_320Regs : SIRegisterTuples<getSubRegs<10>.ret, SGPR_32, 105, 4, 10, "s">;
+
+// SGPR 352-bit registers. No operations use these, but they are defined for symmetry with the 352-bit VGPRs.
+def SGPR_352Regs : SIRegisterTuples<getSubRegs<11>.ret, SGPR_32, 105, 4, 11, "s">;
+
+// SGPR 384-bit registers. No operations use these, but they are defined for symmetry with the 384-bit VGPRs.
+def SGPR_384Regs : SIRegisterTuples<getSubRegs<12>.ret, SGPR_32, 105, 4, 12, "s">;
+
 // SGPR 512-bit registers
 def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">;
 
@@ -465,6 +491,18 @@ def TTMP_224Regs : SIRegisterTuples<getSubRegs<7>.ret, TTMP_32, 15, 4, 7, "ttmp"
 // Trap handler TMP 256-bit registers
 def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
 
+// Trap handler TMP 288-bit registers
+def TTMP_288Regs : SIRegisterTuples<getSubRegs<9>.ret, TTMP_32, 15, 4, 9, "ttmp">;
+
+// Trap handler TMP 320-bit registers
+def TTMP_320Regs : SIRegisterTuples<getSubRegs<10>.ret, TTMP_32, 15, 4, 10, "ttmp">;
+
+// Trap handler TMP 352-bit registers
+def TTMP_352Regs : SIRegisterTuples<getSubRegs<11>.ret, TTMP_32, 15, 4, 11, "ttmp">;
+
+// Trap handler TMP 384-bit registers
+def TTMP_384Regs : SIRegisterTuples<getSubRegs<12>.ret, TTMP_32, 15, 4, 12, "ttmp">;
+
 // Trap handler TMP 512-bit registers
 def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
 
@@ -609,6 +647,18 @@ def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
 // VGPR 256-bit registers
 def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
 
+// VGPR 288-bit registers
+def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 255, 1, 9, "v">;
+
+// VGPR 320-bit registers
+def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 255, 1, 10, "v">;
+
+// VGPR 352-bit registers
+def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 255, 1, 11, "v">;
+
+// VGPR 384-bit registers
+def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 255, 1, 12, "v">;
+
 // VGPR 512-bit registers
 def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
 
@@ -653,6 +703,18 @@ def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
 // AGPR 256-bit registers
 def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
 
+// AGPR 288-bit registers
+def AGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, AGPR_32, 255, 1, 9, "a">;
+
+// AGPR 320-bit registers
+def AGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, AGPR_32, 255, 1, 10, "a">;
+
+// AGPR 352-bit registers
+def AGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, AGPR_32, 255, 1, 11, "a">;
+
+// AGPR 384-bit registers
+def AGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, AGPR_32, 255, 1, 12, "a">;
+
 // AGPR 512-bit registers
 def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
 
@@ -829,6 +891,10 @@ defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
 defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
+defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
+defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
+defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
 
 let GlobalPriority = true in {
 defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
@@ -873,6 +939,10 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
 defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
 defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
 defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
+defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
+defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
+defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
+defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
 
 let GlobalPriority = true in {
 defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
@@ -897,6 +967,10 @@ defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
 defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
 defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
 defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+defm AReg_288 : ARegClass<9, [v9i32, v9f32], (add AGPR_288)>;
+defm AReg_320 : ARegClass<10, [v10i32, v10f32], (add AGPR_320)>;
+defm AReg_352 : ARegClass<11, [v11i32, v11f32], (add AGPR_352)>;
+defm AReg_384 : ARegClass<12, [v12i32, v12f32], (add AGPR_384)>;
 
 let GlobalPriority = true in {
 defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
@@ -963,6 +1037,10 @@ defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
 defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
 defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
 defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
+defm AV_288 : AVRegClass<9, VReg_288.RegTypes, (add VGPR_288), (add AGPR_288)>;
+defm AV_320 : AVRegClass<10, VReg_320.RegTypes, (add VGPR_320), (add AGPR_320)>;
+defm AV_352 : AVRegClass<11, VReg_352.RegTypes, (add VGPR_352), (add AGPR_352)>;
+defm AV_384 : AVRegClass<12, VReg_384.RegTypes, (add VGPR_384), (add AGPR_384)>;
 
 let GlobalPriority = true in {
 defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;

diff  --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 0e01d93e11009..2b10be1c8b3c6 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -292,6 +292,14 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
     RC = &AMDGPU::VReg_224RegClass;
   } else if (Info->VAddrDwords == 8) {
     RC = &AMDGPU::VReg_256RegClass;
+  } else if (Info->VAddrDwords == 9) {
+    RC = &AMDGPU::VReg_288RegClass;
+  } else if (Info->VAddrDwords == 10) {
+    RC = &AMDGPU::VReg_320RegClass;
+  } else if (Info->VAddrDwords == 11) {
+    RC = &AMDGPU::VReg_352RegClass;
+  } else if (Info->VAddrDwords == 12) {
+    RC = &AMDGPU::VReg_384RegClass;
   } else {
     RC = &AMDGPU::VReg_512RegClass;
     NewAddrDwords = 16;

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 8d622f124d9aa..ce2f7f0642be1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2203,6 +2203,42 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AV_256RegClassID:
   case AMDGPU::AV_256_Align2RegClassID:
     return 256;
+  case AMDGPU::SGPR_288RegClassID:
+  case AMDGPU::SReg_288RegClassID:
+  case AMDGPU::VReg_288RegClassID:
+  case AMDGPU::AReg_288RegClassID:
+  case AMDGPU::VReg_288_Align2RegClassID:
+  case AMDGPU::AReg_288_Align2RegClassID:
+  case AMDGPU::AV_288RegClassID:
+  case AMDGPU::AV_288_Align2RegClassID:
+    return 288;
+  case AMDGPU::SGPR_320RegClassID:
+  case AMDGPU::SReg_320RegClassID:
+  case AMDGPU::VReg_320RegClassID:
+  case AMDGPU::AReg_320RegClassID:
+  case AMDGPU::VReg_320_Align2RegClassID:
+  case AMDGPU::AReg_320_Align2RegClassID:
+  case AMDGPU::AV_320RegClassID:
+  case AMDGPU::AV_320_Align2RegClassID:
+    return 320;
+  case AMDGPU::SGPR_352RegClassID:
+  case AMDGPU::SReg_352RegClassID:
+  case AMDGPU::VReg_352RegClassID:
+  case AMDGPU::AReg_352RegClassID:
+  case AMDGPU::VReg_352_Align2RegClassID:
+  case AMDGPU::AReg_352_Align2RegClassID:
+  case AMDGPU::AV_352RegClassID:
+  case AMDGPU::AV_352_Align2RegClassID:
+    return 352;
+  case AMDGPU::SGPR_384RegClassID:
+  case AMDGPU::SReg_384RegClassID:
+  case AMDGPU::VReg_384RegClassID:
+  case AMDGPU::AReg_384RegClassID:
+  case AMDGPU::VReg_384_Align2RegClassID:
+  case AMDGPU::AReg_384_Align2RegClassID:
+  case AMDGPU::AV_384RegClassID:
+  case AMDGPU::AV_384_Align2RegClassID:
+    return 384;
   case AMDGPU::SGPR_512RegClassID:
   case AMDGPU::SReg_512RegClassID:
   case AMDGPU::VReg_512RegClassID:

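[Editor's note] getRegBitWidth() above is the reverse query: given a register class ID it
reports the width in bits, so utilities that size spill slots or count dwords keep working
for the new classes. A hedged usage sketch (actual call sites in the backend differ; the
include path is the one used from inside lib/Target/AMDGPU):

  #include "Utils/AMDGPUBaseInfo.h"  // target-internal header

  // Sketch: derive the dword count of a register class from its reported width.
  // With this patch, e.g. AMDGPU::VReg_352RegClassID reports 352 bits, i.e. 11 dwords.
  static unsigned getNumDwords(unsigned RCID) {
    return llvm::AMDGPU::getRegBitWidth(RCID) / 32;
  }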
diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 5e53c25d1207c..2bebc5ed9b53b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @add_i32() #0 {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'add_i32'
@@ -27,7 +27,7 @@ define amdgpu_kernel void @add_i32() #0 {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %i32 = add i32 undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index 3dbcbf2cfb1c4..b57f26cdc2928 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -50,7 +50,7 @@ define i32 @add(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -79,7 +79,7 @@ define i32 @add(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -108,7 +108,7 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -137,7 +137,7 @@ define i32 @add(i32 %arg) {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -230,7 +230,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -259,7 +259,7 @@ define i32 @sub(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -288,7 +288,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -317,7 +317,7 @@ define i32 @sub(i32 %arg) {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
index 2d074ee9f151f..b1ff4a4a0acb1 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
@@ -50,7 +50,7 @@ define i32 @add(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -79,7 +79,7 @@ define i32 @add(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -108,7 +108,7 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -137,7 +137,7 @@ define i32 @add(i32 %arg) {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -230,7 +230,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -259,7 +259,7 @@ define i32 @sub(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -288,7 +288,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
@@ -317,7 +317,7 @@ define i32 @sub(i32 %arg) {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index f18d55e4ef3d1..d22d8a98b4a43 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; NOPACKEDF32-LABEL: 'fadd_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fadd_f32() #0 {
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fadd float undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index ca9754d1c849b..11ce416b7fd79 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @fdiv_f32_ieee() #0 {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'fdiv_f32_ieee'
@@ -30,7 +30,7 @@ define amdgpu_kernel void @fdiv_f32_ieee() #0 {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f32 = fdiv <4 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v5f32 = fdiv <5 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %v9f32 = fdiv <9 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fdiv float undef, undef
@@ -51,7 +51,7 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4f32 = fdiv <4 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v5f32 = fdiv <5 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %v9f32 = fdiv <9 x float> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'fdiv_f32_ftzdaz'
@@ -61,7 +61,7 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fdiv float undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 9ed9e700284c6..a9f1210a598f0 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF64-LABEL: 'fma_f32'
@@ -27,7 +27,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fma_f32'
@@ -37,7 +37,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-SIZE-LABEL: 'fma_f32'
@@ -47,7 +47,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; FASTF64-SIZE-LABEL: 'fma_f32'
@@ -57,7 +57,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f32'
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fma_f32() #0 {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 29528136decd6..c8dab09e0dbf7 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; F32-LABEL: 'fmul_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
 ; F32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
 ; F32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
 ; F32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; F32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
 ; F32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-SIZE-LABEL: 'fmul_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmul_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fmul_f32() #0 {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fmul float undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 0142e55ce35bf..b3bf580e75e66 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; NOPACKEDF32-LABEL: 'fsub_f32'
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32'
@@ -35,7 +35,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32'
@@ -45,7 +45,7 @@ define amdgpu_kernel void @fsub_f32() #0 {
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = fsub float undef, undef

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index de9ba82bc2380..1444db7248330 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @mul_i32() #0 {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i32 = mul <4 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5i32 = mul <5 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i32 = mul <8 x i32> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v9i32 = mul <9 x i32> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9i32 = mul <9 x i32> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'mul_i32'
@@ -23,7 +23,7 @@ define amdgpu_kernel void @mul_i32() #0 {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = mul <4 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5i32 = mul <5 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = mul <8 x i32> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v9i32 = mul <9 x i32> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9i32 = mul <9 x i32> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %i32 = mul i32 undef, undef

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index b668c42e99fe2..72fdd481dbb67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -2671,6 +2671,1208 @@ entry:
   ret void
 }
 
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT:    s_set_gpr_idx_on s11, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_mov_b32 m0, s11
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    v_mov_b32_e32 v7, s7
+; GFX10-NEXT:    v_mov_b32_e32 v8, s8
+; GFX10-NEXT:    v_movreld_b32_e32 v0, v9
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v9f32_s_v_s:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT:    s_mov_b32 m0, s11
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT:    v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT:    v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT:    v_movreld_b32_e32 v0, v9
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+  ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_v(<9 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s0
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v11, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s7
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v18, v0, vcc
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v9
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v18, s8
+; GFX10-NEXT:    v_mov_b32_e32 v10, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, s1
+; GFX10-NEXT:    v_mov_b32_e32 v12, s2
+; GFX10-NEXT:    v_mov_b32_e32 v13, s3
+; GFX10-NEXT:    v_mov_b32_e32 v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, s5
+; GFX10-NEXT:    v_mov_b32_e32 v16, s6
+; GFX10-NEXT:    v_mov_b32_e32 v17, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v9f32_s_v_v:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    v_dual_mov_b32 v18, s8 :: v_dual_mov_b32 v17, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
+; GFX11-NEXT:    v_mov_b32_e32 v10, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v13, s3
+; GFX11-NEXT:    v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v15, s5
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT:    v_dual_mov_b32 v1, v9 :: v_dual_cndmask_b32 v8, v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+  ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_s(<9 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_s:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
+; GFX10PLUS-NEXT:    v_movreld_b32_e32 v0, v9
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+  ret <9 x float> %insert
+}
+
+define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_v(<9 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_v:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v10
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <9 x float> %vec, float %val, i32 %idx
+  ret <9 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT:    s_set_gpr_idx_on s12, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_mov_b32 m0, s12
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    v_mov_b32_e32 v7, s7
+; GFX10-NEXT:    v_mov_b32_e32 v8, s8
+; GFX10-NEXT:    v_mov_b32_e32 v9, s9
+; GFX10-NEXT:    v_movreld_b32_e32 v0, v10
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v10f32_s_v_s:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_mov_b32 m0, s12
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT:    v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT:    v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT:    v_movreld_b32_e32 v0, v10
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+  ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_v(<10 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s0
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v11, v11, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s7
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 8, v1
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v18, v0, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v19, v0, vcc
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v11
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    v_mov_b32_e32 v19, s9
+; GFX10-NEXT:    v_mov_b32_e32 v10, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, s1
+; GFX10-NEXT:    v_mov_b32_e32 v12, s2
+; GFX10-NEXT:    v_mov_b32_e32 v13, s3
+; GFX10-NEXT:    v_mov_b32_e32 v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, s5
+; GFX10-NEXT:    v_mov_b32_e32 v16, s6
+; GFX10-NEXT:    v_mov_b32_e32 v17, s7
+; GFX10-NEXT:    v_mov_b32_e32 v18, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v19, v0, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v10f32_s_v_v:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    v_dual_mov_b32 v19, s9 :: v_dual_mov_b32 v18, s8
+; GFX11-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX11-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v11, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v14, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v15, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v18, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v11
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v19, v0 :: v_dual_mov_b32 v0, v10
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+  ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_s(<10 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_s:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
+; GFX10PLUS-NEXT:    v_movreld_b32_e32 v0, v10
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+  ret <10 x float> %insert
+}
+
+define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_v(<10 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_v:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v11
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <10 x float> %vec, float %val, i32 %idx
+  ret <10 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s10
+; GPRIDX-NEXT:    s_set_gpr_idx_on s13, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v11
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    s_mov_b32 s10, s12
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_mov_b32 m0, s13
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    v_mov_b32_e32 v7, s7
+; GFX10-NEXT:    v_mov_b32_e32 v8, s8
+; GFX10-NEXT:    v_mov_b32_e32 v9, s9
+; GFX10-NEXT:    v_mov_b32_e32 v10, s10
+; GFX10-NEXT:    v_movreld_b32_e32 v0, v11
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v11f32_s_v_s:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    s_mov_b32 s10, s12
+; GFX11-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT:    s_mov_b32 m0, s13
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT:    v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT:    v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT:    v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s10
+; GFX11-NEXT:    v_movreld_b32_e32 v0, v11
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+  ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_v(<11 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s0
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v11, v13, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s7
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v18, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 9, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 10, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v1
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v19, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v20, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v21, v0, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v22, v0, s[2:3]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v11
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s10, s12
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    v_mov_b32_e32 v22, s10
+; GFX10-NEXT:    v_mov_b32_e32 v12, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, s1
+; GFX10-NEXT:    v_mov_b32_e32 v14, s2
+; GFX10-NEXT:    v_mov_b32_e32 v15, s3
+; GFX10-NEXT:    v_mov_b32_e32 v16, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v17, s5
+; GFX10-NEXT:    v_mov_b32_e32 v18, s6
+; GFX10-NEXT:    v_mov_b32_e32 v19, s7
+; GFX10-NEXT:    v_mov_b32_e32 v20, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v21, s9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, v12
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v11f32_s_v_v:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s10, s12
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v21, s9
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_mov_b32_e32 v12, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v17, s5
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v19, s7
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX11-NEXT:    v_dual_mov_b32 v1, v11 :: v_dual_cndmask_b32 v10, v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v12
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+  ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_s(<11 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v11
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_s:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
+; GFX10PLUS-NEXT:    v_movreld_b32_e32 v0, v11
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+  ret <11 x float> %insert
+}
+
+define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_v(<11 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v12
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_v:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v12
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <11 x float> %vec, float %val, i32 %idx
+  ret <11 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s11
+; GPRIDX-NEXT:    s_set_gpr_idx_on s14, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    s_mov_b32 s10, s12
+; GFX10-NEXT:    s_mov_b32 s11, s13
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_mov_b32 m0, s14
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    v_mov_b32_e32 v7, s7
+; GFX10-NEXT:    v_mov_b32_e32 v8, s8
+; GFX10-NEXT:    v_mov_b32_e32 v9, s9
+; GFX10-NEXT:    v_mov_b32_e32 v10, s10
+; GFX10-NEXT:    v_mov_b32_e32 v11, s11
+; GFX10-NEXT:    v_movreld_b32_e32 v0, v12
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v12f32_s_v_s:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    s_mov_b32 s10, s12
+; GFX11-NEXT:    s_mov_b32 s11, s13
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_mov_b32 m0, s14
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT:    v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT:    v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT:    v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10
+; GFX11-NEXT:    v_movreld_b32_e32 v0, v12
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+  ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_v(<12 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v23, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s0
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 8, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 9, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 10, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 11, v1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v18, v0, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v20, v0, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v21, v0, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v22, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v23, v0, s[6:7]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v13
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    s_mov_b32 s5, s7
+; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    s_mov_b32 s9, s11
+; GFX10-NEXT:    s_mov_b32 s11, s13
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
+; GFX10-NEXT:    s_mov_b32 s10, s12
+; GFX10-NEXT:    v_mov_b32_e32 v23, s11
+; GFX10-NEXT:    v_mov_b32_e32 v12, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, s1
+; GFX10-NEXT:    v_mov_b32_e32 v14, s2
+; GFX10-NEXT:    v_mov_b32_e32 v15, s3
+; GFX10-NEXT:    v_mov_b32_e32 v16, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v17, s5
+; GFX10-NEXT:    v_mov_b32_e32 v18, s6
+; GFX10-NEXT:    v_mov_b32_e32 v19, s7
+; GFX10-NEXT:    v_mov_b32_e32 v20, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v21, s9
+; GFX10-NEXT:    v_mov_b32_e32 v22, s10
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v23, v0, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, v12
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_insertelement_v12f32_s_v_v:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s7, s9
+; GFX11-NEXT:    s_mov_b32 s9, s11
+; GFX11-NEXT:    s_mov_b32 s11, s13
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
+; GFX11-NEXT:    s_mov_b32 s10, s12
+; GFX11-NEXT:    v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10
+; GFX11-NEXT:    v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2
+; GFX11-NEXT:    v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6
+; GFX11-NEXT:    v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v14, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v15, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v18, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v19, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v20, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v22, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v13
+; GFX11-NEXT:    v_dual_cndmask_b32 v11, v23, v0 :: v_dual_mov_b32 v0, v12
+; GFX11-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+  ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_s(<12 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v12
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_s:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
+; GFX10PLUS-NEXT:    v_movreld_b32_e32 v0, v12
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+  ret <12 x float> %insert
+}
+
+define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_v(<12 x float> %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v12f32_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v13
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_v:
+; GFX10PLUS:       ; %bb.0: ; %entry
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v13
+; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <12 x float> %vec, float %val, i32 %idx
+  ret <12 x float> %insert
+}
+
 define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_s_s(<16 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v16i32_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
@@ -5246,47 +6448,41 @@ entry:
 define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
 ; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
 ; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
 ; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, s15
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, s14
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, s13
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, s12
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v12, s10
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s9
-; GPRIDX-NEXT:    v_mov_b32_e32 v10, s8
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, s7
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, s6
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, s5
-; GPRIDX-NEXT:    v_mov_b32_e32 v6, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v5, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v4, s2
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
 ; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 0
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 2
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s2
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[4:5]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v1, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 2
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 3
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v2
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v3
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v4
@@ -5301,22 +6497,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
 ;
 ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
-; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    v_mov_b32_e32 v17, s15
-; GFX10-NEXT:    v_mov_b32_e32 v16, s14
-; GFX10-NEXT:    v_mov_b32_e32 v15, s13
-; GFX10-NEXT:    v_mov_b32_e32 v14, s12
-; GFX10-NEXT:    v_mov_b32_e32 v13, s11
-; GFX10-NEXT:    v_mov_b32_e32 v12, s10
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v11, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v10, s8
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s7
@@ -5356,19 +6546,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
 ;
 ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
-; GFX11-NEXT:    s_mov_b32 s2, s4
 ; GFX11-NEXT:    s_mov_b32 s3, s5
-; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
-; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    s_mov_b32 s8, s10
 ; GFX11-NEXT:    s_mov_b32 s9, s11
-; GFX11-NEXT:    v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
-; GFX11-NEXT:    v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
-; GFX11-NEXT:    v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
 ; GFX11-NEXT:    v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
 ; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
 ; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
@@ -5406,77 +6593,65 @@ entry:
 define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
 ; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
 ; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
 ; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, s15
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, s14
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, s13
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, s12
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, s10
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s9
-; GPRIDX-NEXT:    v_mov_b32_e32 v11, s8
-; GPRIDX-NEXT:    v_mov_b32_e32 v10, s7
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, s6
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, s5
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v6, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v5, s2
 ; GPRIDX-NEXT:    v_mov_b32_e32 v4, s1
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s2
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v5, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v7, v0, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s[4:5]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v10, v1, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[4:5]
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 4, v2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[0:1]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v3
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v4
-; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v5
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v6
-; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v7
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v8
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v7
-; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v2
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v0
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v1
 ; GPRIDX-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
-; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    v_mov_b32_e32 v18, s15
-; GFX10-NEXT:    v_mov_b32_e32 v17, s14
-; GFX10-NEXT:    v_mov_b32_e32 v16, s13
-; GFX10-NEXT:    v_mov_b32_e32 v15, s12
-; GFX10-NEXT:    v_mov_b32_e32 v14, s11
-; GFX10-NEXT:    v_mov_b32_e32 v13, s10
+; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    s_mov_b32 s2, s4
+; GFX10-NEXT:    s_mov_b32 s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v12, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v11, s8
 ; GFX10-NEXT:    v_mov_b32_e32 v10, s7
@@ -5516,19 +6691,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
 ;
 ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
-; GFX11-NEXT:    s_mov_b32 s2, s4
 ; GFX11-NEXT:    s_mov_b32 s3, s5
-; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
-; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    s_mov_b32 s8, s10
 ; GFX11-NEXT:    s_mov_b32 s9, s11
-; GFX11-NEXT:    v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
-; GFX11-NEXT:    v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
-; GFX11-NEXT:    v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_mov_b32 s2, s4
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s6, s8
+; GFX11-NEXT:    s_mov_b32 s8, s10
 ; GFX11-NEXT:    v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
 ; GFX11-NEXT:    v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
 ; GFX11-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
@@ -5596,55 +6768,55 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
 ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_s:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s2, 4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 3
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s2, 3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s2, 2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, s2, 4
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s1
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v6
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX11-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX11-NEXT:    ; return to shader part epilog
 entry:
@@ -5685,14 +6857,19 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
 ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_v:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 4, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -5702,38 +6879,33 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
 ; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v12
 ; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v12
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 4, v12
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s1
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v6
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX11-NEXT:    v_readfirstlane_b32 s8, v8
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v9
 ; GFX11-NEXT:    ; return to shader part epilog
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
index 355ffd1456dc3..d6a433ae00076 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
@@ -688,7 +688,7 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
     ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:sgpr(<3 x s64>) = G_IMPLICIT_DEF
     %1:sgpr(<3 x s64>) = G_IMPLICIT_DEF

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
index 440e475eedc4b..67142e720219b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
@@ -296,7 +296,7 @@ body: |
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr3_sgpr4_sgpr5
     ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8
     ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY $sgpr9_sgpr10_sgpr11
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
     ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
     ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
     ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8
@@ -332,7 +332,7 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_192 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_192 = COPY $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_384 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
     ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
     ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
     ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 463b59a4fd9f2..5a7ddddedd279 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
@@ -30,7 +30,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray_flat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
@@ -78,7 +78,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
@@ -89,7 +89,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ra
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray_flat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
@@ -118,7 +118,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
 ; GFX10-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
 ; GFX10-NEXT:    v_and_or_b32 v7, v7, 0xffff, v11
-; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
 ; GFX1030-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[15:30], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[15:25], s[4:7]
 ; GFX1030-NEXT:    ; implicit-def: $vgpr11
 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
@@ -182,34 +182,30 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v16, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v17, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v18, v13
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v14
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v11
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v12
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v13
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v14
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
 ; GFX1013-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT:    image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
-; GFX1013-NEXT:    ; implicit-def: $vgpr16
-; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT:    image_bvh_intersect_ray v[15:18], v[0:10], s[4:7]
+; GFX1013-NEXT:    ; implicit-def: $vgpr11
+; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10
+; GFX1013-NEXT:    ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1013-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX1013-NEXT:  ; %bb.2:
 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
-; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
-; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
-; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v16
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v17
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX1013-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr:
@@ -391,7 +387,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
 ; GFX1030-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7]
 ; GFX1030-NEXT:    ; implicit-def: $vgpr12
 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
 ; GFX1030-NEXT:    ; implicit-def: $vgpr17
@@ -415,34 +411,30 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v16, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v17, v13
-; GFX1013-NEXT:    v_mov_b32_e32 v18, v14
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v12
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v13
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v14
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v15
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
 ; GFX1013-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7]
-; GFX1013-NEXT:    ; implicit-def: $vgpr16
-; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7]
+; GFX1013-NEXT:    ; implicit-def: $vgpr12
+; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX1013-NEXT:    ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1013-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX1013-NEXT:  ; %bb.2:
 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
-; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
-; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
-; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v16
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v17
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v18
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v19
 ; GFX1013-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr:
@@ -508,7 +500,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
 ; GFX1030-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16
 ; GFX1030-NEXT:    ; implicit-def: $vgpr10
 ; GFX1030-NEXT:    ; implicit-def: $vgpr14
 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
@@ -529,42 +521,38 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v16, v10
-; GFX1013-NEXT:    v_mov_b32_e32 v17, v11
-; GFX1013-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX1013-NEXT:    v_and_b32_e32 v11, 0xffff, v8
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX1013-NEXT:    v_and_b32_e32 v15, 0xffff, v8
 ; GFX1013-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX1013-NEXT:    v_mov_b32_e32 v18, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v13
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
-; GFX1013-NEXT:    v_and_or_b32 v7, v7, 0xffff, v11
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v14
+; GFX1013-NEXT:    v_and_or_b32 v7, v7, 0xffff, v15
 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v12
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v13
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
 ; GFX1013-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16
-; GFX1013-NEXT:    ; implicit-def: $vgpr16
-; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16
+; GFX1013-NEXT:    ; implicit-def: $vgpr10
+; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+; GFX1013-NEXT:    ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1013-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX1013-NEXT:  ; %bb.2:
 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
-; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
-; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
-; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v16
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v17
 ; GFX1013-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
@@ -631,7 +619,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -661,7 +649,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -885,7 +873,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -914,7 +902,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -1012,7 +1000,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1030-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -1056,7 +1044,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, s0
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
index 70e486c22187d..34f83b0ae2ecb 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
 # Check that %11 and %20 have been coalesced.
-# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]]
-# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG:[0-9]+]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG]]
 
 ---
 name:            main
@@ -17,9 +17,9 @@ registers:
   - { id: 6, class: sgpr_128 }
   - { id: 7, class: sgpr_512 }
   - { id: 9, class: vreg_512 }
-  - { id: 11, class: vreg_512 }
+  - { id: 11, class: vreg_352 }
   - { id: 18, class: vgpr_32 }
-  - { id: 20, class: vreg_512 }
+  - { id: 20, class: vreg_352 }
   - { id: 27, class: vgpr_32 }
 liveins:
   - { reg: '$sgpr2_sgpr3', virtual-reg: '%0' }
@@ -61,7 +61,7 @@ body:             |
     %11.sub6 = COPY %1
     %11.sub7 = COPY %1
     %11.sub8 = COPY %1
-    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
+    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V11 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
     %20.sub1 = COPY %2
     %20.sub2 = COPY %2
     %20.sub3 = COPY %2
@@ -70,6 +70,6 @@ body:             |
     %20.sub6 = COPY %2
     %20.sub7 = COPY %2
     %20.sub8 = COPY %2
-    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
+    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V11 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
 
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index d17ed9302fb5c..eb0fc7e06d177 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,14 +171,20 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s26, -1
+; SI-NEXT:    s_mov_b32 s27, 0xe8f000
+; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
-; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s22, s10
 ; SI-NEXT:    s_mov_b32 s23, s11
@@ -197,24 +203,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
 ;
 ; VI-LABEL: test_copy_v4i8_x4:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s90, -1
+; VI-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-NEXT:    s_add_u32 s88, s88, s3
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_mov_b32 s14, s10
+; VI-NEXT:    s_addc_u32 s89, s89, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
-; VI-NEXT:    s_mov_b32 s22, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
+; VI-NEXT:    s_mov_b32 s22, s10
 ; VI-NEXT:    s_mov_b32 s23, s11
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3

diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
index 2b8a712b28c05..2b4651487eff6 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
@@ -7,21 +7,27 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
 ; RRLIST-LABEL: sccClobber:
 ; RRLIST:       ; %bb.0: ; %entry
 ; RRLIST-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; RRLIST-NEXT:    v_mov_b32_e32 v2, 0
+; RRLIST-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; RRLIST-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; RRLIST-NEXT:    s_mov_b32 s22, -1
+; RRLIST-NEXT:    s_mov_b32 s23, 0xe00000
+; RRLIST-NEXT:    s_add_u32 s20, s20, s3
 ; RRLIST-NEXT:    s_waitcnt lgkmcnt(0)
 ; RRLIST-NEXT:    s_load_dword s16, s[8:9], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x44
 ; RRLIST-NEXT:    s_load_dword s17, s[10:11], 0x0
+; RRLIST-NEXT:    s_addc_u32 s21, s21, 0
 ; RRLIST-NEXT:    s_waitcnt lgkmcnt(0)
-; RRLIST-NEXT:    s_min_i32 s4, s16, 0
 ; RRLIST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; RRLIST-NEXT:    s_min_i32 s4, s16, 0
 ; RRLIST-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
 ; RRLIST-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; RRLIST-NEXT:    s_cselect_b32 s0, s16, s17
 ; RRLIST-NEXT:    s_cmp_eq_u64 s[12:13], s[2:3]
 ; RRLIST-NEXT:    s_cselect_b32 s0, s4, s0
+; RRLIST-NEXT:    v_mov_b32_e32 v2, 0
 ; RRLIST-NEXT:    v_mov_b32_e32 v0, s0
 ; RRLIST-NEXT:    global_store_dword v2, v0, s[14:15]
 ; RRLIST-NEXT:    s_endpgm
@@ -29,21 +35,27 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
 ; FAST-LABEL: sccClobber:
 ; FAST:       ; %bb.0: ; %entry
 ; FAST-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; FAST-NEXT:    v_mov_b32_e32 v2, 0
+; FAST-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; FAST-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; FAST-NEXT:    s_mov_b32 s22, -1
+; FAST-NEXT:    s_mov_b32 s23, 0xe00000
+; FAST-NEXT:    s_add_u32 s20, s20, s3
 ; FAST-NEXT:    s_waitcnt lgkmcnt(0)
 ; FAST-NEXT:    s_load_dword s16, s[8:9], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x44
 ; FAST-NEXT:    s_load_dword s17, s[10:11], 0x0
+; FAST-NEXT:    s_addc_u32 s21, s21, 0
 ; FAST-NEXT:    s_waitcnt lgkmcnt(0)
-; FAST-NEXT:    s_min_i32 s4, s16, 0
 ; FAST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; FAST-NEXT:    s_min_i32 s4, s16, 0
 ; FAST-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
 ; FAST-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; FAST-NEXT:    s_cselect_b32 s0, s16, s17
 ; FAST-NEXT:    s_cmp_eq_u64 s[12:13], s[2:3]
 ; FAST-NEXT:    s_cselect_b32 s0, s4, s0
+; FAST-NEXT:    v_mov_b32_e32 v2, 0
 ; FAST-NEXT:    v_mov_b32_e32 v0, s0
 ; FAST-NEXT:    global_store_dword v2, v0, s[14:15]
 ; FAST-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index ff0171095b842..8110e04c133f1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -310,7 +310,7 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GCN-LABEL: {{^}}v5i64_func_void:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx2 v[8:9], off
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <5 x i64> @v5i64_func_void() #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e1b9bb1af72a5..446f81e45415c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -385,22 +385,19 @@ bb7:                                              ; preds = %bb4, %bb1
 ; GCN: s_load_dword [[ARG:s[0-9]+]]
 
 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
+; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: s_waitcnt
 ; MOVREL: s_add_i32 m0, [[ARG]], -16
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
 ; MOVREL: s_mov_b32 m0, -1
 
 
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; IDXMODE: s_waitcnt
 ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
-; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; IDXMODE: s_set_gpr_idx_off
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
-; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
 ; IDXMODE: s_set_gpr_idx_off
 

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 54111a4e1a09d..ade5d5154d749 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -610,10 +610,16 @@ entry:
 define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double5_inselt:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GCN-NEXT:    s_mov_b32 s18, -1
+; GCN-NEXT:    s_mov_b32 s19, 0xe80000
+; GCN-NEXT:    s_add_u32 s16, s16, s3
 ; GCN-NEXT:    s_load_dword s12, s[0:1], 0xa4
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
 ; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
+; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 4
 ; GCN-NEXT:    s_cselect_b32 s9, 0x3ff00000, s9
@@ -622,10 +628,8 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
 ; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s3
 ; GCN-NEXT:    s_cselect_b32 s2, 0, s2
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, s8
-; GCN-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-NEXT:    s_cselect_b32 s8, 0x3ff00000, s1
-; GCN-NEXT:    s_cselect_b32 s9, 0, s0
+; GCN-NEXT:    s_cselect_b32 s13, 0x3ff00000, s1
+; GCN-NEXT:    s_cselect_b32 s14, 0, s0
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 3
 ; GCN-NEXT:    s_cselect_b32 s0, 0x3ff00000, s7
 ; GCN-NEXT:    s_cselect_b32 s1, 0, s6
@@ -636,23 +640,26 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
 ; GCN-NEXT:    s_add_u32 s0, s10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    s_addc_u32 s1, s11, 0
-; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v6, s0
-; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v6, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    s_add_u32 s0, s10, 32
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    v_mov_b32_e32 v7, s11
-; GCN-NEXT:    s_add_u32 s0, s10, 32
-; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-NEXT:    s_addc_u32 s1, s11, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index c47760a01547f..aaa4833482e30 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -6,7 +6,7 @@
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
 ; individual elements instead of 128-bit stores.
 
-define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v2f32_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -37,7 +37,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v2f32_1:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -68,7 +68,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
 ; SI-LABEL: insertelement_v2i32_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -99,7 +99,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
 ; SI-LABEL: insertelement_v2i32_1:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -132,7 +132,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <
 
 ; FIXME: Why is the constant moved into the intermediate register and
 ; not just directly into the vector component?
-define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v4f32_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -167,7 +167,7 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v4f32_1:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -202,7 +202,7 @@ define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v4f32_2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -237,7 +237,7 @@ define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v4f32_3:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -272,7 +272,7 @@ define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
 ; SI-LABEL: insertelement_v4i32_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -307,7 +307,7 @@ define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v3f32_1:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -338,7 +338,7 @@ define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
 ; SI-LABEL: insertelement_v3f32_2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -369,7 +369,7 @@ define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out,
   ret void
 }
 
-define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
 ; GCN-LABEL: insertelement_v3f32_3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_endpgm
@@ -394,7 +394,107 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
   ret <4 x float> %tmp2
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+define <9 x float> @insertelement_to_v9f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v9f32_undef:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0xc0a00000
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x41880000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %tmp = load <9 x float>, <9 x float> addrspace(4)* undef
+  %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
+  %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
+  %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
+  ret <9 x float> %tmp3
+}
+
+define <10 x float> @insertelement_to_v10f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v10f32_undef:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 2.0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, s12
+; GCN-NEXT:    v_mov_b32_e32 v9, s13
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %tmp = load <10 x float>, <10 x float> addrspace(4)* undef
+  %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
+  ret <10 x float> %tmp1
+}
+
+define <11 x float> @insertelement_to_v11f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v11f32_undef:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, s12
+; GCN-NEXT:    v_mov_b32_e32 v9, s13
+; GCN-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %tmp = load <11 x float>, <11 x float> addrspace(4)* undef
+  %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
+  ret <11 x float> %tmp1
+}
+
+define <12 x float> @insertelement_to_v12f32_undef() nounwind {
+; GCN-LABEL: insertelement_to_v12f32_undef:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, s12
+; GCN-NEXT:    v_mov_b32_e32 v9, s13
+; GCN-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-NEXT:    v_mov_b32_e32 v11, s15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %tmp = load <12 x float>, <12 x float> addrspace(4)* undef
+  %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
+  ret <12 x float> %tmp1
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -441,7 +541,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
@@ -494,7 +594,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
@@ -555,7 +655,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -606,7 +706,249 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v9f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_movreld_b32_e32 v0, v9
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v9f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    v_mov_b32_e32 v8, s6
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_movreld_b32_e32 v0, v9
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
+  store <9 x float> %vecins, <9 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v10f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    v_mov_b32_e32 v10, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    v_mov_b32_e32 v9, s7
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_movreld_b32_e32 v0, v10
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v10f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v10, 0x40a00000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    v_mov_b32_e32 v8, s6
+; VI-NEXT:    v_mov_b32_e32 v9, s7
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_movreld_b32_e32 v0, v10
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
+  store <10 x float> %vecins, <10 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v11f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    v_mov_b32_e32 v11, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_movreld_b32_e32 v0, v11
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v11f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v11, 0x40a00000
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v8, s8
+; VI-NEXT:    v_mov_b32_e32 v9, s9
+; VI-NEXT:    v_mov_b32_e32 v10, s10
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    v_movreld_b32_e32 v0, v11
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
+  store <11 x float> %vecins, <11 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v12f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    v_mov_b32_e32 v12, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    v_mov_b32_e32 v11, s19
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_movreld_b32_e32 v0, v12
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v12f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v12, 0x40a00000
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v8, s8
+; VI-NEXT:    v_mov_b32_e32 v9, s9
+; VI-NEXT:    v_mov_b32_e32 v10, s10
+; VI-NEXT:    v_mov_b32_e32 v11, s11
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    v_movreld_b32_e32 v0, v12
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
+  store <12 x float> %vecins, <12 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -677,7 +1019,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -718,7 +1060,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
@@ -763,7 +1105,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
@@ -816,7 +1158,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
@@ -865,7 +1207,241 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v9i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    v_movreld_b32_e32 v0, 5
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v9i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    v_mov_b32_e32 v8, s6
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    v_movreld_b32_e32 v0, 5
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <9 x i32> %a, i32 5, i32 %b
+  store <9 x i32> %vecins, <9 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v10i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    v_mov_b32_e32 v9, s7
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    v_movreld_b32_e32 v0, 5
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v10i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    v_mov_b32_e32 v8, s6
+; VI-NEXT:    v_mov_b32_e32 v9, s7
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_movreld_b32_e32 v0, 5
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <10 x i32> %a, i32 5, i32 %b
+  store <10 x i32> %vecins, <10 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v11i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    v_movreld_b32_e32 v0, 5
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v11i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v8, s8
+; VI-NEXT:    v_mov_b32_e32 v9, s9
+; VI-NEXT:    v_mov_b32_e32 v10, s10
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    v_movreld_b32_e32 v0, 5
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <11 x i32> %a, i32 5, i32 %b
+  store <11 x i32> %vecins, <11 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
+; SI-LABEL: dynamic_insertelement_v12i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x18
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    v_mov_b32_e32 v11, s19
+; SI-NEXT:    s_mov_b32 m0, s4
+; SI-NEXT:    v_movreld_b32_e32 v0, 5
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_insertelement_v12i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x60
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v8, s8
+; VI-NEXT:    v_mov_b32_e32 v9, s9
+; VI-NEXT:    v_mov_b32_e32 v10, s10
+; VI-NEXT:    v_mov_b32_e32 v11, s11
+; VI-NEXT:    s_mov_b32 m0, s4
+; VI-NEXT:    v_movreld_b32_e32 v0, 5
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <12 x i32> %a, i32 5, i32 %b
+  store <12 x i32> %vecins, <12 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
@@ -934,7 +1510,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)*
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -973,7 +1549,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1023,7 +1599,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
@@ -1065,7 +1641,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
 
 ; FIXME: post legalize i16 and i32 shifts aren't merged because of
 ; isTypeDesirableForOp in SimplifyDemandedBits
-define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
@@ -1110,7 +1686,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
@@ -1149,7 +1725,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
   ret void
 }
 
-define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
+define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
 ; SI-LABEL: s_dynamic_insertelement_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1201,7 +1777,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
@@ -1410,24 +1986,24 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
-define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; SI-LABEL: insert_split_bb:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s6, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB30_4
+; SI-NEXT:    s_cbranch_scc0 .LBB42_4
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB30_3
-; SI-NEXT:  .LBB30_2: ; %if
+; SI-NEXT:    s_cbranch_vccnz .LBB42_3
+; SI-NEXT:  .LBB42_2: ; %if
 ; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
-; SI-NEXT:  .LBB30_3: ; %endif
+; SI-NEXT:  .LBB42_3: ; %endif
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
@@ -1435,8 +2011,8 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB30_4:
-; SI-NEXT:    s_branch .LBB30_2
+; SI-NEXT:  .LBB42_4:
+; SI-NEXT:    s_branch .LBB42_2
 ;
 ; VI-LABEL: insert_split_bb:
 ; VI:       ; %bb.0: ; %entry
@@ -1444,14 +2020,14 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s6, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB30_4
+; VI-NEXT:    s_cbranch_scc0 .LBB42_4
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
-; VI-NEXT:    s_cbranch_execnz .LBB30_3
-; VI-NEXT:  .LBB30_2: ; %if
+; VI-NEXT:    s_cbranch_execnz .LBB42_3
+; VI-NEXT:  .LBB42_2: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
-; VI-NEXT:  .LBB30_3: ; %endif
+; VI-NEXT:  .LBB42_3: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
@@ -1459,8 +2035,8 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB30_4:
-; VI-NEXT:    s_branch .LBB30_2
+; VI-NEXT:  .LBB42_4:
+; VI-NEXT:    s_branch .LBB42_2
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %1 = icmp eq i32 %a, 0
@@ -1483,7 +2059,7 @@ endif:
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x18
@@ -1530,7 +2106,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
@@ -1577,7 +2153,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
@@ -1638,7 +2214,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
@@ -1709,7 +2285,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
+define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: dynamic_insertelement_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x20

diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
index fa5e1b6c34540..f189a0ffffd4f 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,19 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
 ; Make sure the expected regmask is generated for sub/superregisters.
 
-; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
+; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
 define void @csr() #0 {
   call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
   ret void
 }
 
-; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
 define void @subregs_for_super() #0 {
   call void asm sideeffect "", "~{v0},~{v1}"() #0
   ret void
 }
 
-; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
 define void @clobbered_reg_with_sub() #0 {
   call void asm sideeffect "", "~{v[0:1]}"() #0
   ret void
@@ -44,3 +45,5 @@ define void @vcc() #0 {
                                          i8* bitcast (void ()* @vcc to i8*)]
 
 attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 09e97d485782a..c03cf9edc44be 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -2286,57 +2286,56 @@ define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5
 ;
 ; VI-LABEL: v5i64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    s_add_u32 s12, s8, 32
+; VI-NEXT:    v_mov_b32_e32 v1, s10
+; VI-NEXT:    s_addc_u32 s13, s9, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s12
+; VI-NEXT:    v_mov_b32_e32 v2, s11
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s13
+; VI-NEXT:    s_add_u32 s4, s8, 16
+; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s9, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5i64_arg:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5i64_arg:
@@ -2429,57 +2428,56 @@ define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out,
 ;
 ; VI-LABEL: v5f64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    s_add_u32 s12, s8, 32
+; VI-NEXT:    v_mov_b32_e32 v1, s10
+; VI-NEXT:    s_addc_u32 s13, s9, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s12
+; VI-NEXT:    v_mov_b32_e32 v2, s11
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s13
+; VI-NEXT:    s_add_u32 s4, s8, 16
+; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s9, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5f64_arg:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5f64_arg:

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 018d0388f5bfb..81cb2fd5d5013 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -32,9 +32,9 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}sample_d_3d:
-; GFX1010-NSA: image_sample_d v[0:3], v[7:22],
+; GFX1010-NSA: image_sample_d v[0:3], v[7:15],
 ; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
-; GFX11-NSA: image_sample_d v[0:3], v[7:22],
+; GFX11-NSA: image_sample_d v[0:3], v[7:15],
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 90c357b3ea997..4be3180a24344 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1568,19 +1568,19 @@ main_body:
 define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
 ; VERDE-LABEL: sample_c_d_o_2darray_V1:
 ; VERDE:       ; %bb.0: ; %main_body
-; VERDE-NEXT:    image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
+; VERDE-NEXT:    image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
 ; VERDE-NEXT:    ; return to shader part epilog
 ;
 ; GFX6789-LABEL: sample_c_d_o_2darray_V1:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
+; GFX6789-NEXT:    image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10PLUS:       ; %bb.0: ; %main_body
-; GFX10PLUS-NEXT:    image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT:    image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 main_body:
@@ -1593,7 +1593,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v9, 0
 ; VERDE-NEXT:    v_mov_b32_e32 v10, v9
-; VERDE-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
+; VERDE-NEXT:    image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
 ; VERDE-NEXT:    s_mov_b32 s15, 0xf000
 ; VERDE-NEXT:    s_mov_b32 s14, -1
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
@@ -1608,7 +1608,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX6789-NEXT:    v_mov_b32_e32 v12, v11
 ; GFX6789-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX6789-NEXT:    v_mov_b32_e32 v10, v12
-; GFX6789-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
+; GFX6789-NEXT:    image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX6789-NEXT:    global_store_dword v11, v10, s[12:13]
@@ -1621,7 +1621,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v12
-; GFX10-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX10-NEXT:    image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    global_store_dword v11, v10, s[12:13]
@@ -1633,7 +1633,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX11-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v12, v11
 ; GFX11-NEXT:    v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
-; GFX11-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX11-NEXT:    image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX11-NEXT:    global_store_b32 v11, v10, s[12:13]
@@ -1650,19 +1650,19 @@ main_body:
 define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
 ; VERDE-LABEL: sample_c_d_o_2darray_V2:
 ; VERDE:       ; %bb.0: ; %main_body
-; VERDE-NEXT:    image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
+; VERDE-NEXT:    image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
 ; VERDE-NEXT:    ; return to shader part epilog
 ;
 ; GFX6789-LABEL: sample_c_d_o_2darray_V2:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
+; GFX6789-NEXT:    image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10PLUS:       ; %bb.0: ; %main_body
-; GFX10PLUS-NEXT:    image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT:    image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 main_body:
@@ -1676,7 +1676,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ; VERDE-NEXT:    v_mov_b32_e32 v9, 0
 ; VERDE-NEXT:    v_mov_b32_e32 v10, v9
 ; VERDE-NEXT:    v_mov_b32_e32 v11, v9
-; VERDE-NEXT:    image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
+; VERDE-NEXT:    image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
 ; VERDE-NEXT:    v_mov_b32_e32 v0, v9
 ; VERDE-NEXT:    v_mov_b32_e32 v1, v10
@@ -1688,7 +1688,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ; GFX6789-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v10, v9
 ; GFX6789-NEXT:    v_mov_b32_e32 v11, v9
-; GFX6789-NEXT:    image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
+; GFX6789-NEXT:    image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX6789-NEXT:    v_mov_b32_e32 v1, v10
@@ -1700,7 +1700,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v9
-; GFX10-NEXT:    image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX10-NEXT:    image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v10
@@ -1712,7 +1712,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ; GFX11-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v10, v9
 ; GFX11-NEXT:    v_mov_b32_e32 v11, v9
-; GFX11-NEXT:    image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
+; GFX11-NEXT:    image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v11
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
index e813fcd8f5299..e4887b8b0f6c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
@@ -186,7 +186,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}sample_c_d_cl_o_2d:
-; GCN: image_sample_c_d_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
+; GCN: image_sample_c_d_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
 define amdgpu_ps <4 x float> @sample_c_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -250,7 +250,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}sample_c_cd_cl_o_2d:
-; GCN: image_sample_c_cd_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
+; GCN: image_sample_c_cd_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
 define amdgpu_ps <4 x float> @sample_c_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 5c314044d5ff5..10b77e967a31d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -20,7 +20,7 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray:
 ; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
@@ -90,7 +90,7 @@ main_body:
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray:
 ; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
@@ -128,7 +128,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX10-NEXT:    s_mov_b32 s15, s13
 ; GFX10-NEXT:    s_mov_b32 s13, s11
-; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
+; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[12:15] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -182,7 +182,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -208,7 +208,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -370,7 +370,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -396,7 +396,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -461,7 +461,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 ; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -484,7 +484,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index fd5c94868fe95..dc246b42376a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -9,7 +9,7 @@
 ; GCN: s_load_dword s{{[0-9]+}}
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(4)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64
-define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -31,7 +31,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(4)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -54,13 +54,69 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}constant_load_v9i32:
+; GCN: s_load_dword
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_32
+define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+  %ld = load <9 x i32>, <9 x i32> addrspace(4)* %in
+  store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v10i32:
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+  %ld = load <10 x i32>, <10 x i32> addrspace(4)* %in
+  store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v11i32:
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+  %ld = load <11 x i32>, <11 x i32> addrspace(4)* %in
+  store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v12i32:
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+entry:
+  %ld = load <12 x i32>, <12 x i32> addrspace(4)* %in
+  store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}constant_load_v16i32:
 ; GCN: s_load_dwordx16
 
@@ -68,7 +124,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -83,7 +139,7 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG: CF_END
 ; EG: VTX_READ_32
-define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load i32, i32 addrspace(4)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -100,7 +156,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out,
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load i32, i32 addrspace(4)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -110,7 +166,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out,
 ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
 ; GCN: s_load_dword
 ; GCN: store_dwordx2
-define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -121,7 +177,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(
 ; GCN: s_load_dword s[[LO:[0-9]+]]
 ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
 ; GCN: store_dwordx2
-define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -131,7 +187,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -145,7 +201,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(
 ; GCN-DAG: s_ashr_i32
 
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -157,7 +213,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -174,7 +230,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -193,7 +249,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-SA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -221,7 +277,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -242,7 +298,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -269,7 +325,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA: {{flat|global}}_store_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -321,7 +377,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 
-define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -372,7 +428,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -424,7 +480,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @constant_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
   store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index e0cad5af38dd2..dd516dcdd8a63 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load float, float addrspace(1)* %in
   store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; R600: VTX_READ_64
-define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
   store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
 ; GCNX3-HSA: flat_load_dwordx3
 
 ; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
   store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
   store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -62,13 +62,89 @@ entry:
 
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
   store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}global_load_v9f32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dword
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_32
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in
+  store <9 x float> %tmp0, <9 x float> addrspace(1)* %out
+  ret void
+}
+
+
+; FUNC-LABEL: {{^}}global_load_v10f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx2
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in
+  store <10 x float> %tmp0, <10 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v11f32:
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in
+  store <11 x float> %tmp0, <11 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v12f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in
+  store <12 x float> %tmp0, <12 x float> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}global_load_v16f32:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +160,7 @@ entry:
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
   store <16 x float> %tmp0, <16 x float> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index b47cbcb6e00e4..e5f25723cb62e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -10,7 +10,7 @@
 ; GCN-HSA: {{flat|global}}_load_dword
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(1)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
 ; GCN-HSA: {{flat|global}}_load_dwordx2
 
 ; EG: VTX_READ_64
-define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
 ; GCNX3-HSA: {{flat|global}}_load_dwordx3
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
 ; GCN-HSA: {{flat|global}}_load_dwordx4
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -62,13 +62,73 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}global_load_v9i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dword
+define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %ld = load <9 x i32>, <9 x i32> addrspace(1)* %in
+  store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v10i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx2
+define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %ld = load <10 x i32>, <10 x i32> addrspace(1)* %in
+  store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v11i32:
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx3
+define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %ld = load <11 x i32>, <11 x i32> addrspace(1)* %in
+  store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+  ret void
+}
+
+
+; FUNC-LABEL: {{^}}global_load_v12i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+; GCN-HSA: {{flat|global}}_load_dwordx4
+define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+entry:
+  %ld = load <12 x i32>, <12 x i32> addrspace(1)* %in
+  store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}global_load_v16i32:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +144,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -100,7 +160,7 @@ entry:
 ; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -119,7 +179,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -132,7 +192,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i3
 
 ; GCN-HSA: {{flat|global}}_load_dword
 ; GCN-HSA: {{flat|global}}_store_dwordx2
-define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -145,7 +205,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
 ; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -158,7 +218,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
 
 ; GCN-HSA: {{flat|global}}_load_dwordx2
 ; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -174,7 +234,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)
 
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -189,7 +249,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)
 ; GCN-HSA: {{flat|global}}_load_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -210,7 +270,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)
 
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -233,7 +293,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -265,7 +325,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -311,7 +371,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-DAG: v_ashrrev_i32
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -346,7 +406,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA: {{flat|global}}_store_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
 ; GCN-HSA: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -446,7 +506,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 
-define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -513,7 +573,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -581,7 +641,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-define amdgpu_kernel void @global_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
index 3c13cda7efca7..67861a395cd2c 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
@@ -714,7 +714,7 @@ body:             |
 
 
 # GFX11-LABEL: name: image_sample_c_d_cl_o_merged_v1v3
-# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V16_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V9_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 # GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -726,9 +726,9 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_512 = IMPLICIT_DEF
-    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_288 = IMPLICIT_DEF
+    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index c815945d03d63..033afeb3adcac 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -5,14 +5,19 @@
 define amdgpu_kernel void @select_f16(
 ; SI-LABEL: select_f16:
 ; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s26, -1
+; SI-NEXT:    s_mov_b32 s27, 0xe8f000
+; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s6
 ; SI-NEXT:    s_mov_b32 s17, s7
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
@@ -34,6 +39,7 @@ define amdgpu_kernel void @select_f16(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -46,14 +52,19 @@ define amdgpu_kernel void @select_f16(
 ;
 ; VI-LABEL: select_f16:
 ; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s26, -1
+; VI-NEXT:    s_mov_b32 s27, 0xe80000
+; VI-NEXT:    s_add_u32 s24, s24, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
-; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s16, s6
 ; VI-NEXT:    s_mov_b32 s17, s7
+; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s20, s8
 ; VI-NEXT:    s_mov_b32 s21, s9
@@ -75,6 +86,7 @@ define amdgpu_kernel void @select_f16(
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_addc_u32 s25, s25, 0
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -420,14 +432,19 @@ entry:
 define amdgpu_kernel void @select_v2f16(
 ; SI-LABEL: select_v2f16:
 ; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s26, -1
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT:    s_mov_b32 s27, 0xe8f000
+; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s6
 ; SI-NEXT:    s_mov_b32 s17, s7
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
@@ -445,6 +462,7 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
@@ -474,8 +492,13 @@ define amdgpu_kernel void @select_v2f16(
 ;
 ; VI-LABEL: select_v2f16:
 ; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
+; VI-NEXT:    s_mov_b32 s26, -1
+; VI-NEXT:    s_mov_b32 s27, 0xe80000
+; VI-NEXT:    s_add_u32 s24, s24, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s14, s2
@@ -499,6 +522,7 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_addc_u32 s25, s25, 0
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)

diff  --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index b49be21442694..f89db580af945 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -67,14 +67,19 @@ entry:
 define amdgpu_kernel void @madak_f16_use_2(
 ; SI-LABEL: madak_f16_use_2:
 ; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s22, -1
+; SI-NEXT:    s_mov_b32 s23, 0xe8f000
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT:    s_add_u32 s20, s20, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s8
 ; SI-NEXT:    s_mov_b32 s17, s9
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s8, s10
 ; SI-NEXT:    s_mov_b32 s9, s11
@@ -91,6 +96,7 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; SI-NEXT:    v_mov_b32_e32 v3, 0x41200000
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_addc_u32 s21, s21, 0
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -106,14 +112,19 @@ define amdgpu_kernel void @madak_f16_use_2(
 ;
 ; VI-LABEL: madak_f16_use_2:
 ; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s22, -1
+; VI-NEXT:    s_mov_b32 s23, 0xe80000
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
+; VI-NEXT:    s_add_u32 s20, s20, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s16, s8
 ; VI-NEXT:    s_mov_b32 s17, s9
+; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s8, s10
 ; VI-NEXT:    s_mov_b32 s9, s11
@@ -130,6 +141,7 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x4900
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_addc_u32 s21, s21, 0
 ; VI-NEXT:    s_mov_b32 s8, s6
 ; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    v_madak_f16 v1, v0, v1, 0x4900

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
index 506d9cd123a95..e75a5c7aabb52 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
@@ -8,11 +8,11 @@ body: |
     ; GCN-LABEL: name: waitcnt-check-inorder
     ; GCN: S_WAITCNT 0
     ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     S_ENDPGM 0
 ...
 ---
@@ -22,11 +22,11 @@ body: |
     ; GCN-LABEL: name: waitcnt-check-vs-vmem
     ; GCN: S_WAITCNT 0
     ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -37,11 +37,11 @@ body: |
     ; GCN-LABEL: name: waitcnt-check-vs-mimg-samp
     ; GCN: S_WAITCNT 0
     ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
     S_ENDPGM 0
 ...
@@ -54,10 +54,10 @@ body: |
     ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_WAITCNT 16240
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     ; GCN-NEXT: S_ENDPGM 0
     $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     S_ENDPGM 0
 ...
 ---
@@ -69,9 +69,9 @@ body: |
     ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
     ; GCN-NEXT: S_WAITCNT 16240
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     ; GCN-NEXT: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/MC/AMDGPU/gfx1013.s b/llvm/test/MC/AMDGPU/gfx1013.s
index 3b1f634b0d40e..b99265feaad9d 100644
--- a/llvm/test/MC/AMDGPU/gfx1013.s
+++ b/llvm/test/MC/AMDGPU/gfx1013.s
@@ -1,28 +1,28 @@
 // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck %s
 
-image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11]
 // CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x02,0x00]
 
-image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] a16
-// CHECK: [0x01,0x9f,0x9c,0xf1,0xf0,0x05,0x02,0x40]
+image_bvh64_intersect_ray v[5:8], v[247:255], s[8:11] a16
+// CHECK: [0x01,0x9f,0x9c,0xf1,0xf7,0x05,0x02,0x40]
 
-image_bvh64_intersect_ray v[5:8], v[1:16], ttmp[12:15]
+image_bvh64_intersect_ray v[5:8], v[1:12], ttmp[12:15]
 // CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x1e,0x00]
 
 image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15]
-// CHECK: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
+// CHECK: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
 
-image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
+image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
 // CHECK: [0x01,0x9f,0x98,0xf1,0x01,0xfc,0x02,0x00]
 
 image_bvh_intersect_ray v[5:8], v[248:255], s[8:11] a16
 // CHECK: [0x01,0x9f,0x98,0xf1,0xf8,0x05,0x02,0x40]
 
-image_bvh_intersect_ray v[5:8], v[1:16], ttmp[12:15]
+image_bvh_intersect_ray v[5:8], v[1:11], ttmp[12:15]
 // CHECK: [0x01,0x9f,0x98,0xf1,0x01,0x05,0x1e,0x00]
 
 image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
-// CHECK: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
+// CHECK: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
 
 image_msaa_load v[5:6], v[1:4], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
 // CHECK: [0x39,0x03,0x00,0xf0,0x01,0x05,0x02,0x00]

diff  --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s
index c6eb05ba02ae0..bb99e6fc3eda5 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_new.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_new.s
@@ -84,16 +84,16 @@ v_fmac_legacy_f32 v0, |v1|, -v2
 v_fmac_legacy_f32 v0, s1, 2.0
 // GFX10: encoding: [0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00]
 
-image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
 // GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00]
 
 image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
 // GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40]
 
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
 // GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00]
 
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
 // GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40]
 
 image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]

diff  --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
index 5216bc4524a67..45ecf93a1d306 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
@@ -298,8 +298,8 @@ image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG
 image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
 ; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x14,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x00,0x14,0x15]
 
-image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
-; GFX10: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
+image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
+; GFX10: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
 
 image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE
 ; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x1c,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x05,0x00,0x00]

diff  --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
index f44b864a6da85..b36722e0f8e43 100644
--- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -761,10 +761,10 @@ global_store_d16_hi_b8 v1, v2, s[104:105]
 global_store_dword_addtid v1, off offset:16 glc slc dlc
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
-image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16
+image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16
 // GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
-image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
+image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
 // GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY

diff  --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
index 7d11e8b6e27b6..1f1cdd70e2dfc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
@@ -1248,23 +1248,23 @@ image_atomic_xor v[1:2], v[2:3], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA uno
 image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe
 // GFX11: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00]
 
-image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11]
 // GFX11: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
 
-image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11]
-// GFX11: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
+image_bvh64_intersect_ray v[5:8], v[244:255], s[8:11]
+// GFX11: [0x80,0x8f,0x68,0xf0,0xf4,0x05,0x02,0x00]
 
-image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16
+image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16
 // GFX11: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
 
-image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16
-// GFX11: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
+image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16
+// GFX11: [0x80,0x8f,0x69,0xf0,0xf7,0xfc,0x1e,0x00]
 
-image_bvh_intersect_ray v[5:8], v[1:16], s[8:11]
+image_bvh_intersect_ray v[5:8], v[1:11], s[8:11]
 // GFX11: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
 
-image_bvh_intersect_ray v[5:8], v[240:255], s[8:11]
-// GFX11: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
+image_bvh_intersect_ray v[5:8], v[245:255], s[8:11]
+// GFX11: [0x80,0x8f,0x64,0xf0,0xf5,0x05,0x02,0x00]
 
 image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16
 // GFX11: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00]
@@ -3264,17 +3264,17 @@ image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
 image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c]
 
-image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
 
-image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x85,0xf0,0xf1,0x05,0x02,0x0c]
 
 image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
 // GFX11: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
@@ -3336,17 +3336,17 @@ image_sample_c_d_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
 image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x10,0xf1,0xf1,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x11,0xf1,0xf1,0x05,0x02,0x0c]
 
 image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
 // GFX11: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
@@ -3360,11 +3360,11 @@ image_sample_c_d_cl v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
 image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
 // GFX11: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c]
 
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
 // GFX11: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0x12,0xf1,0xf1,0xff,0x02,0x0c]
 
 image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
 // GFX11: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c]
@@ -3384,11 +3384,11 @@ image_sample_c_d_cl v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_
 image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
 // GFX11: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c]
 
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
 // GFX11: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0x12,0xf1,0xf1,0xff,0x02,0x0c]
 
 image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
 // GFX11: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64]
@@ -3408,11 +3408,11 @@ image_sample_c_d_cl_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_
 image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_cl_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x50,0xf1,0xf1,0x05,0x02,0x0c]
 
 image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c]
@@ -3480,23 +3480,23 @@ image_sample_c_d_cl_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IM
 image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
+image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
 // GFX11: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
-// GFX11: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
+image_sample_c_d_cl_o v[254:255], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x03,0x28,0xf1,0xf1,0xfe,0x02,0x0c]
 
 image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
 // GFX11: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c]
@@ -3504,10 +3504,10 @@ image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IM
 image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
 // GFX11: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c]
 
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
 // GFX11: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
 // GFX11: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
 
 image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
@@ -3528,10 +3528,10 @@ image_sample_c_d_cl_o v[5:6], v[1:6], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IM
 image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
 // GFX11: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c]
 
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
 // GFX11: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
 // GFX11: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
 
 image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
@@ -3552,10 +3552,10 @@ image_sample_c_d_cl_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSR
 image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
 
 image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
@@ -3696,16 +3696,16 @@ image_sample_c_d_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1
 image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c]
 
-image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0xac,0xf0,0xf1,0x05,0x02,0x0c]
 
-image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
 
 image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
@@ -3720,11 +3720,11 @@ image_sample_c_d_o v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2
 image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
 // GFX11: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c]
 
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
 // GFX11: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0xae,0xf0,0xf1,0xff,0x02,0x0c]
 
 image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
 // GFX11: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c]
@@ -3744,11 +3744,11 @@ image_sample_c_d_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1
 image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
 // GFX11: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c]
 
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
 // GFX11: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0xae,0xf0,0xf1,0xff,0x02,0x0c]
 
 image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
 // GFX11: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64]
@@ -3768,11 +3768,11 @@ image_sample_c_d_o_g16 v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_I
 image_sample_c_d_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c]
 
-image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_c_d_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0xf0,0xf0,0xf1,0x05,0x02,0x0c]
 
 image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c]
@@ -4344,11 +4344,11 @@ image_sample_d v[5:6], v[1:3], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a1
 image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c]
 
-image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x70,0xf0,0xf1,0x05,0x02,0x0c]
 
 image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c]
@@ -4416,10 +4416,10 @@ image_sample_d_cl v[5:6], v[1:3], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
 image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c]
 
-image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
 
 image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
@@ -4560,17 +4560,17 @@ image_sample_d_cl_o v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
 image_sample_d_cl_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x1c,0xf1,0xf1,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x1d,0xf1,0xf1,0x05,0x02,0x0c]
 
 image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
 // GFX11: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
@@ -4584,11 +4584,11 @@ image_sample_d_cl_o v[5:7], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_
 image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe
 // GFX11: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c]
 
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
 // GFX11: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
-// GFX11: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16
+// GFX11: [0x0c,0x03,0x1e,0xf1,0xf1,0xff,0x02,0x0c]
 
 image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16
 // GFX11: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c]
@@ -4608,11 +4608,11 @@ image_sample_d_cl_o v[5:6], v[1:5], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_
 image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe
 // GFX11: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c]
 
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
 // GFX11: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
-// GFX11: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16
+// GFX11: [0x14,0x04,0x1e,0xf1,0xf1,0xff,0x02,0x0c]
 
 image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16
 // GFX11: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64]
@@ -4632,11 +4632,11 @@ image_sample_d_cl_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_
 image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c]
 
-image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
 
-image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
-// GFX11: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
+image_sample_d_cl_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+// GFX11: [0x08,0x03,0x54,0xf1,0xf1,0x05,0x02,0x0c]
 
 image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c]
@@ -4776,17 +4776,17 @@ image_sample_d_o v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
 image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
 // GFX11: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c]
 
-image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
+image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D
 // GFX11: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
 
-image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
 // GFX11: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
 
-image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-// GFX11: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
+image_sample_d_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
+// GFX11: [0x08,0x03,0x99,0xf0,0xf1,0x05,0x02,0x0c]
 
 image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D
 // GFX11: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]

diff  --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
index 3c43bb0473711..4c65dc348a578 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s
@@ -163,8 +163,8 @@ image_sample_d v[64:66], [v32, v16, v8], s[4:11], s[100:103] dmask:0x7 dim:SQ_RS
 image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D
 // GFX11: image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
 
-image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
-// GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
+image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
+// GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
 
 image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY
 // GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00]
@@ -286,17 +286,17 @@ image_msaa_load v[1:2], v[5:8], s[8:15] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
 image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
 // GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00]
 
-image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
+image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
+// GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
 
 image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
 // GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00]
 
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
+image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
+// GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
 
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
-// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
+image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
+// GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
 
 image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15]
 // GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f]

diff  --git a/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
index 632a4fce4cd27..250c00a6321b0 100644
--- a/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx7_asm_mimg.s
@@ -1848,7 +1848,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1947,7 +1947,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2619,7 +2619,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2712,7 +2712,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3381,7 +3381,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3474,7 +3474,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4137,7 +4137,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6069,7 +6069,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6168,7 +6168,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6264,7 +6264,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6357,7 +6357,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6453,7 +6453,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6546,7 +6546,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6639,7 +6639,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6732,7 +6732,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm

diff  --git a/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
index e8f1f6ee04ac8..abcd4b700e583 100644
--- a/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx8_asm_mimg.s
@@ -1773,7 +1773,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1875,7 +1875,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2568,7 +2568,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2664,7 +2664,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3354,7 +3354,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3450,7 +3450,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4134,7 +4134,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6156,7 +6156,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6258,7 +6258,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6357,7 +6357,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6453,7 +6453,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6552,7 +6552,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6648,7 +6648,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6744,7 +6744,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6840,7 +6840,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm

diff  --git a/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
index 21e1f3dce964a..ed65976380a53 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_mimg.s
@@ -1851,7 +1851,7 @@ image_sample_d v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -1956,7 +1956,7 @@ image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2667,7 +2667,7 @@ image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -2766,7 +2766,7 @@ image_sample_c_d_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3477,7 +3477,7 @@ image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -3576,7 +3576,7 @@ image_sample_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4278,7 +4278,7 @@ image_sample_c_d_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -4377,7 +4377,7 @@ image_sample_c_d_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00]
 
 image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6399,7 +6399,7 @@ image_sample_cd v5, v[1:3], s[8:15], s[12:15] dmask:0x0
 image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
@@ -6504,7 +6504,7 @@ image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6606,7 +6606,7 @@ image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6705,7 +6705,7 @@ image_sample_c_cd_cl v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6807,7 +6807,7 @@ image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x0
 image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
@@ -6906,7 +6906,7 @@ image_sample_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -7005,7 +7005,7 @@ image_sample_c_cd_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm
@@ -7104,7 +7104,7 @@ image_sample_c_cd_cl_o v[5:8], v[1:8], s[8:15], s[12:15] dmask:0xf
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0
 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1
+image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1
 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00]
 
 image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
index 1162710aa6e6b..cdd7eeabf2fb5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt
@@ -75,16 +75,16 @@
 # GFX10: v_fmac_legacy_f32_e64 v0, s1, 2.0
 0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00
 
-# GFX10: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+# GFX10: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
 0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00
 
 # GFX10: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
 0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40
 
-# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
 0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00
 
-# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
 0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40
 
 # GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
index 6778c10078924..8461a5ebfceb8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt
@@ -1248,22 +1248,22 @@
 # GFX11: image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe ; encoding: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00]
 0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00
 
-# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00]
 0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00
 
-# GFX11: image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[240:251], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00]
 0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00
 
-# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
+# GFX11: image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00]
 0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00
 
-# GFX11: image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
+# GFX11: image_bvh64_intersect_ray v[252:255], v[240:248], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00]
 0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00
 
-# GFX11: image_bvh_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
+# GFX11: image_bvh_intersect_ray v[5:8], v[1:11], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00]
 0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00
 
-# GFX11: image_bvh_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
+# GFX11: image_bvh_intersect_ray v[5:8], v[240:250], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00]
 0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00
 
 # GFX11: image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00]
@@ -3264,16 +3264,16 @@
 # GFX11: image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c]
@@ -3336,16 +3336,16 @@
 # GFX11: image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c]
@@ -3360,10 +3360,10 @@
 # GFX11: image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c]
 0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c
 
-# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c]
 0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c]
 0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c]
@@ -3384,10 +3384,10 @@
 # GFX11: image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c]
 0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c
 
-# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c]
 0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c]
 0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64]
@@ -3408,10 +3408,10 @@
 # GFX11: image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c]
@@ -3480,22 +3480,22 @@
 # GFX11: image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c]
 0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c]
 0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v[254:255], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c]
 0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c]
@@ -3504,10 +3504,10 @@
 # GFX11: image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c]
 0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c
 
-# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c]
 0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
 0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x2b,0xf1,0x01,0x05,0x22,0x0c]
@@ -3528,10 +3528,10 @@
 # GFX11: image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c]
 0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c
 
-# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c]
 0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c]
 0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x2b,0xf1,0x01,0x05,0x38,0x64]
@@ -3552,10 +3552,10 @@
 # GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c]
 0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x59,0xf1,0x01,0x05,0x02,0x0c]
@@ -3696,16 +3696,16 @@
 # GFX11: image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c]
 0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c]
@@ -3720,10 +3720,10 @@
 # GFX11: image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c]
 0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c
 
-# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c]
 0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c]
 0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c]
@@ -3744,10 +3744,10 @@
 # GFX11: image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c]
 0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c
 
-# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c]
 0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c]
 0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64]
@@ -3768,10 +3768,10 @@
 # GFX11: image_sample_c_d_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c]
 0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c]
@@ -4344,10 +4344,10 @@
 # GFX11: image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c]
 0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c
 
-# GFX11: image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c]
@@ -4416,10 +4416,10 @@
 # GFX11: image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c]
 0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x05,0xf1,0x01,0x05,0x02,0x0c]
@@ -4560,16 +4560,16 @@
 # GFX11: image_sample_d_cl_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c]
@@ -4584,10 +4584,10 @@
 # GFX11: image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c]
 0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c
 
-# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c]
 0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
 0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c]
@@ -4608,10 +4608,10 @@
 # GFX11: image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c]
 0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c
 
-# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c]
 0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
+# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c]
 0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c
 
 # GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64]
@@ -4632,10 +4632,10 @@
 # GFX11: image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c]
@@ -4776,16 +4776,16 @@
 # GFX11: image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c]
 0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c]
 0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c
 
-# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
+# GFX11: image_sample_d_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c]
 0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c
 
 # GFX11: image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c]

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
index cf05f42a0ef51..240b8a37b273d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
@@ -162,7 +162,7 @@
 # GFX11: image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
 0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64
 
-# GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
+# GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64]
 0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64
 
 # GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00]
@@ -282,16 +282,16 @@
 # GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00]
 0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00
 
-# GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00]
 0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00
 
 # GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00]
 0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00
 
-# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00]
 0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00
 
-# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
+# GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00]
 0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00
 
 # GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f]

