[llvm] 41507fe - [GISel] Combine (Scalarize) vector load followed by an element extract.

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 5 21:53:28 PST 2023


Author: Pranav Taneja
Date: 2023-12-06T11:23:23+05:30
New Revision: 41507fe595d0fa3d81e151d70431d51897f8d14d

URL: https://github.com/llvm/llvm-project/commit/41507fe595d0fa3d81e151d70431d51897f8d14d
DIFF: https://github.com/llvm/llvm-project/commit/41507fe595d0fa3d81e151d70431d51897f8d14d.diff

LOG: [GISel] Combine (Scalarize) vector load followed by an element extract.

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index ba72a3b71ffd7..a4e9c92b48976 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -196,6 +196,10 @@ class CombinerHelper {
   /// Match (and (load x), mask) -> zextload x
   bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Combine a G_EXTRACT_VECTOR_ELT of a load into a narrowed
+  /// load.
+  bool matchCombineExtractedVectorLoad(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
   void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
 

diff  --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9a84ab80157f3..77db371adaf77 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -259,6 +259,12 @@ def sext_inreg_to_zext_inreg : GICombineRule<
   }])
 >;
 
+def combine_extracted_vector_load : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+        [{ return Helper.matchCombineExtractedVectorLoad(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+  
 def combine_indexed_load_store : GICombineRule<
   (defs root:$root, indexed_load_store_matchdata:$matchinfo),
   (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1291,8 +1297,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
                                            constant_fold_fp_binop]>;
 
 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
-    extract_vec_elt_combines, combines_for_extload,
-    undef_combines, identity_combines, phi_combines,
+    extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
+    undef_combines, identity_combines, phi_combines, 
     simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
     reassocs, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,

diff  --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c2a7c2d011881..491eabc06f387 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1165,6 +1165,101 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
   return RealUse;
 }
 
+bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI,
+                                                     BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+  // Check if there is a load that defines the vector being extracted from.
+  auto *LoadMI = getOpcodeDef<GLoad>(MI.getOperand(1).getReg(), MRI);
+  if (!LoadMI)
+    return false;
+
+  Register Vector = MI.getOperand(1).getReg();
+  LLT VecEltTy = MRI.getType(Vector).getElementType();
+  LLT ResultTy = MRI.getType(MI.getOperand(0).getReg());
+
+  assert(ResultTy == VecEltTy);
+
+  // Checking whether we should reduce the load width.
+  if (!MRI.hasOneNonDBGUse(Vector))
+    return false;
+
+  // Check if the defining load is simple.
+  if (!LoadMI->isSimple())
+    return false;
+
+  // If the vector element type is not a multiple of a byte then we are unable
+  // to correctly compute an address to load only the extracted element as a
+  // scalar.
+  if (!VecEltTy.isByteSized())
+    return false;
+
+  // Check if the new load that we are going to create is legal
+  // if we are in the post-legalization phase.
+  MachineMemOperand MMO = LoadMI->getMMO();
+  Align Alignment = MMO.getAlign();
+  MachinePointerInfo PtrInfo;
+  uint64_t Offset;
+
+  // Finding the appropriate PtrInfo if offset is a known constant.
+  // This is required to create the memory operand for the narrowed load.
+  // This machine memory operand object helps us infer about legality
+  // before we proceed to combine the instruction.
+  if (auto CVal = getIConstantVRegVal(Vector, MRI)) {
+    int Elt = CVal->getZExtValue();
+    // FIXME: should be (ABI size)*Elt.
+    Offset = VecEltTy.getSizeInBits() * Elt / 8;
+    PtrInfo = MMO.getPointerInfo().getWithOffset(Offset);
+  } else {
+    // Discard the pointer info except the address space because the memory
+    // operand can't represent this new access since the offset is variable.
+    Offset = VecEltTy.getSizeInBits() / 8;
+    PtrInfo = MachinePointerInfo(MMO.getPointerInfo().getAddrSpace());
+  }
+
+  Alignment = commonAlignment(Alignment, Offset);
+
+  Register VecPtr = LoadMI->getPointerReg();
+  LLT PtrTy = MRI.getType(VecPtr);
+
+  MachineFunction &MF = *MI.getMF();
+  auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, VecEltTy);
+
+  LegalityQuery::MemDesc MMDesc(*NewMMO);
+
+  LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltTy, PtrTy}, {MMDesc}};
+
+  if (!isLegalOrBeforeLegalizer(Q))
+    return false;
+
+  // Load must be allowed and fast on the target.
+  LLVMContext &C = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  unsigned Fast = 0;
+  if (!getTargetLowering().allowsMemoryAccess(C, DL, VecEltTy, *NewMMO,
+                                              &Fast) ||
+      !Fast)
+    return false;
+
+  Register Result = MI.getOperand(0).getReg();
+  Register Index = MI.getOperand(2).getReg();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    GISelObserverWrapper DummyObserver;
+    LegalizerHelper Helper(B.getMF(), DummyObserver, B);
+    //// Get pointer to the vector element.
+    Register finalPtr = Helper.getVectorElementPointer(
+        LoadMI->getPointerReg(), MRI.getType(LoadMI->getOperand(0).getReg()),
+        Index);
+    // New G_LOAD instruction.
+    B.buildLoad(Result, finalPtr, PtrInfo, Alignment);
+    // Remove original GLOAD instruction.
+    LoadMI->eraseFromParent();
+  };
+
+  return true;
+}
+
 bool CombinerHelper::matchCombineIndexedLoadStore(
     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
   auto &LdSt = cast<GLoadStore>(MI);

diff  --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 0d7620d1c883d..7493afd672d43 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14659,17 +14659,9 @@ define i8 @load_single_extract_variable_index_i8(ptr %A, i32 %idx) {
 ;
 ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i8:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    sub sp, sp, #16
-; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    mov w9, w1
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and x9, x9, #0xf
-; CHECK-GISEL-NEXT:    lsl x10, x9, #1
-; CHECK-GISEL-NEXT:    str q0, [sp]
-; CHECK-GISEL-NEXT:    sub x9, x10, x9
-; CHECK-GISEL-NEXT:    ldrb w0, [x8, x9]
-; CHECK-GISEL-NEXT:    add sp, sp, #16
+; CHECK-GISEL-NEXT:    mov w8, w1
+; CHECK-GISEL-NEXT:    and x8, x8, #0xf
+; CHECK-GISEL-NEXT:    ldrb w0, [x0, x8]
 ; CHECK-GISEL-NEXT:    ret
   %lv = load <16 x i8>, ptr %A
   %e = extractelement <16 x i8> %lv, i32 %idx
@@ -14692,15 +14684,9 @@ define i16 @load_single_extract_variable_index_i16(ptr %A, i32 %idx) {
 ;
 ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i16:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    sub sp, sp, #16
-; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    mov w9, w1
-; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and x9, x9, #0x7
-; CHECK-GISEL-NEXT:    str q0, [sp]
-; CHECK-GISEL-NEXT:    ldrh w0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT:    add sp, sp, #16
+; CHECK-GISEL-NEXT:    mov w8, w1
+; CHECK-GISEL-NEXT:    and x8, x8, #0x7
+; CHECK-GISEL-NEXT:    ldrh w0, [x0, x8, lsl #1]
 ; CHECK-GISEL-NEXT:    ret
   %lv = load <8 x i16>, ptr %A
   %e = extractelement <8 x i16> %lv, i32 %idx
@@ -14717,15 +14703,9 @@ define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) {
 ;
 ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i32:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    sub sp, sp, #16
-; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    mov w9, w1
-; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and x9, x9, #0x3
-; CHECK-GISEL-NEXT:    str q0, [sp]
-; CHECK-GISEL-NEXT:    ldr w0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT:    add sp, sp, #16
+; CHECK-GISEL-NEXT:    mov w8, w1
+; CHECK-GISEL-NEXT:    and x8, x8, #0x3
+; CHECK-GISEL-NEXT:    ldr w0, [x0, x8, lsl #2]
 ; CHECK-GISEL-NEXT:    ret
   %lv = load <4 x i32>, ptr %A
   %e = extractelement <4 x i32> %lv, i32 %idx
@@ -14779,14 +14759,8 @@ define i32 @load_single_extract_variable_index_masked_i32(ptr %A, i32 %idx) {
 ;
 ; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked_i32:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    sub sp, sp, #16
-; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and w9, w1, #0x3
-; CHECK-GISEL-NEXT:    str q0, [sp]
-; CHECK-GISEL-NEXT:    ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT:    add sp, sp, #16
+; CHECK-GISEL-NEXT:    and w8, w1, #0x3
+; CHECK-GISEL-NEXT:    ldr w0, [x0, w8, uxtw #2]
 ; CHECK-GISEL-NEXT:    ret
   %idx.x = and i32 %idx, 3
   %lv = load <4 x i32>, ptr %A
@@ -14803,14 +14777,8 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) {
 ;
 ; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked2_i32:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    sub sp, sp, #16
-; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and w9, w1, #0x1
-; CHECK-GISEL-NEXT:    str q0, [sp]
-; CHECK-GISEL-NEXT:    ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT:    add sp, sp, #16
+; CHECK-GISEL-NEXT:    and w8, w1, #0x1
+; CHECK-GISEL-NEXT:    ldr w0, [x0, w8, uxtw #2]
 ; CHECK-GISEL-NEXT:    ret
   %idx.x = and i32 %idx, 1
   %lv = load <4 x i32>, ptr %A

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 69346de9bb798..80dc3dead35ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -124,71 +124,71 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
 ; GFX9-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -202,71 +202,71 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
 ; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 33a4d3c5494f7..a13c60b4e8414 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,170 +8,12 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v64i32_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v6, v2
-; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
-; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_and_b32_e32 v0, 63, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v16, v20
-; GCN-NEXT:    v_mov_b32_e32 v17, v21
-; GCN-NEXT:    v_mov_b32_e32 v18, v22
-; GCN-NEXT:    v_mov_b32_e32 v19, v23
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    v_and_b32_e32 v2, 63, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    global_load_dword v0, v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -183,174 +25,12 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v128i16_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v6, v2
-; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
-; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_bfe_u32 v0, v6, 1, 6
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v16, v20
-; GCN-NEXT:    v_mov_b32_e32 v17, v21
-; GCN-NEXT:    v_mov_b32_e32 v18, v22
-; GCN-NEXT:    v_mov_b32_e32 v19, v23
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(16)
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    global_load_ushort v0, v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <128 x i16>, ptr addrspace(1) %ptr
@@ -362,171 +42,12 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v32i64_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v6, v2
-; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
-; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_and_b32_e32 v0, 31, v6
-; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v16, v20
-; GCN-NEXT:    v_mov_b32_e32 v17, v21
-; GCN-NEXT:    v_mov_b32_e32 v18, v22
-; GCN-NEXT:    v_mov_b32_e32 v19, v23
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    v_and_b32_e32 v2, 31, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <32 x i64>, ptr addrspace(1) %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 786d65f7dcc40..057790617204c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -8,29 +8,36 @@
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
 ; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GCN-NEXT:    s_lshl_b32 m0, s4, 1
+; GCN-NEXT:    s_and_b32 s0, s4, 3
+; GCN-NEXT:    s_lshl_b32 s0, s0, 4
+; GCN-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-NEXT:    s_add_u32 s0, s2, s0
+; GCN-NEXT:    s_addc_u32 s1, s3, s1
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_movrels_b64 s[0:1], s[8:9]
-; GCN-NEXT:    s_movrels_b64 s[2:3], s[10:11]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GFX10-NEXT:    s_lshl_b32 m0, s4, 1
+; GFX10-NEXT:    s_and_b32 s0, s4, 3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_movrels_b64 s[0:1], s[8:9]
-; GFX10-NEXT:    s_movrels_b64 s[2:3], s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[8:23], s[2:3], 0x0
-; GFX11-NEXT:    s_lshl_b32 m0, s4, 1
+; GFX11-NEXT:    s_and_b32 s0, s4, 3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_movrels_b64 s[0:1], s[8:9]
-; GFX11-NEXT:    s_movrels_b64 s[2:3], s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i128> %vector, i32 %idx
@@ -40,46 +47,32 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre
 define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX9-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX9-NEXT:    s_lshl_b32 s2, s0, 1
+; GFX9-NEXT:    s_and_b32 s0, s2, 3
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v18, v2
-; GFX9-NEXT:    s_set_gpr_idx_off
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, v3
-; GFX9-NEXT:    s_set_gpr_idx_off
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v18
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
-; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[6:7]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
-; GFX8-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
-; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX8-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_movrels_b32_e32 v1, v3
-; GFX8-NEXT:    v_movrels_b32_e32 v0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
@@ -88,20 +81,13 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX7-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_movrels_b32_e32 v1, v3
-; GFX7-NEXT:    v_movrels_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
@@ -110,44 +96,38 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX10-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX10-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX10-NEXT:    s_and_b32 s0, s2, 3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_movrels_b32_e32 v1, v3
-; GFX10-NEXT:    v_movrels_b32_e32 v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:32
-; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:48
-; GFX11-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX11-NEXT:    s_and_b32 s0, s2, 3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_movrels_b32_e32 v0, v2
-; GFX11-NEXT:    v_movrels_b32_e32 v1, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 %idx
@@ -158,298 +138,66 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_add_u32_e32 v16, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 1, v16
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v15, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v13, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, v12, v14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v13, v15, s4
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 2, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v19, v5, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[16:19], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:32
-; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:48
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v17, v19, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v18, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v19, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 5, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v14, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v15, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 %idx
@@ -459,68 +207,15 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_add_u32_e32 v19, 1, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v6, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-NEXT:    v_mov_b32_e32 v8, s7
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-NEXT:    v_mov_b32_e32 v10, s9
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v11, s10
-; GFX9-NEXT:    v_mov_b32_e32 v12, s11
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX9-NEXT:    v_mov_b32_e32 v13, s12
-; GFX9-NEXT:    v_mov_b32_e32 v14, s13
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX9-NEXT:    v_mov_b32_e32 v15, s14
-; GFX9-NEXT:    v_mov_b32_e32 v16, s15
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
@@ -529,68 +224,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
 ;
 ; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    v_mov_b32_e32 v6, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v7, s6
-; GFX8-NEXT:    v_mov_b32_e32 v8, s7
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v9, s8
-; GFX8-NEXT:    v_mov_b32_e32 v10, s9
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v11, s10
-; GFX8-NEXT:    v_mov_b32_e32 v12, s11
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX8-NEXT:    v_mov_b32_e32 v13, s12
-; GFX8-NEXT:    v_mov_b32_e32 v14, s13
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX8-NEXT:    v_mov_b32_e32 v15, s14
-; GFX8-NEXT:    v_mov_b32_e32 v16, s15
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
@@ -599,68 +241,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
 ;
 ; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s2
-; GFX7-NEXT:    v_mov_b32_e32 v4, s3
-; GFX7-NEXT:    v_mov_b32_e32 v5, s4
-; GFX7-NEXT:    v_mov_b32_e32 v6, s5
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, s6
-; GFX7-NEXT:    v_mov_b32_e32 v8, s7
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v9, s8
-; GFX7-NEXT:    v_mov_b32_e32 v10, s9
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX7-NEXT:    v_mov_b32_e32 v11, s10
-; GFX7-NEXT:    v_mov_b32_e32 v12, s11
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX7-NEXT:    v_mov_b32_e32 v13, s12
-; GFX7-NEXT:    v_mov_b32_e32 v14, s13
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 1, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX7-NEXT:    v_mov_b32_e32 v15, s14
-; GFX7-NEXT:    v_mov_b32_e32 v16, s15
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
@@ -669,54 +258,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 3, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 4, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 5, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 6, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 7, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s19, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
@@ -725,63 +275,18 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[4:19], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 5, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s19, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -793,19 +298,19 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
 ; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(4) %ptr
@@ -814,34 +319,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p
 }
 
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_mov_b32 s2, s6
-; GCN-NEXT:    s_mov_b32 s3, s7
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
-; GFX10-NEXT:    s_mov_b32 s2, s6
-; GFX10-NEXT:    s_mov_b32 s3, s7
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x10
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s4
-; GFX11-NEXT:    s_mov_b32 s1, s5
-; GFX11-NEXT:    s_mov_b32 s2, s6
-; GFX11-NEXT:    s_mov_b32 s3, s7
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i128> %vector, i32 1
@@ -849,34 +354,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p
 }
 
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s8
-; GCN-NEXT:    s_mov_b32 s1, s9
-; GCN-NEXT:    s_mov_b32 s2, s10
-; GCN-NEXT:    s_mov_b32 s3, s11
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s8
-; GFX10-NEXT:    s_mov_b32 s1, s9
-; GFX10-NEXT:    s_mov_b32 s2, s10
-; GFX10-NEXT:    s_mov_b32 s3, s11
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x20
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s8
-; GFX11-NEXT:    s_mov_b32 s1, s9
-; GFX11-NEXT:    s_mov_b32 s2, s10
-; GFX11-NEXT:    s_mov_b32 s3, s11
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i128> %vector, i32 2
@@ -884,34 +389,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p
 }
 
 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s12
-; GCN-NEXT:    s_mov_b32 s1, s13
-; GCN-NEXT:    s_mov_b32 s2, s14
-; GCN-NEXT:    s_mov_b32 s3, s15
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0xc
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s12
-; GFX10-NEXT:    s_mov_b32 s1, s13
-; GFX10-NEXT:    s_mov_b32 s2, s14
-; GFX10-NEXT:    s_mov_b32 s3, s15
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x30
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s12
-; GFX11-NEXT:    s_mov_b32 s1, s13
-; GFX11-NEXT:    s_mov_b32 s2, s14
-; GFX11-NEXT:    s_mov_b32 s3, s15
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i128>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i128> %vector, i32 3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 7028d1157787f..6d772df3fa281 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -6,42 +6,74 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_lshr_b32 s2, s4, 1
-; GCN-NEXT:    s_cmp_eq_u32 s2, 1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_and_b32 s1, s4, 1
-; GCN-NEXT:    s_lshl_b32 s1, s1, 4
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_u32 s0, s2, s0
+; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s4, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_add_u32 s0, s2, s0
+; GFX8-NEXT:    s_addc_u32 s1, s3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s4, 3
+; GFX7-NEXT:    s_lshl_b32 s4, s2, 1
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_lshr_b32 s2, s4, 1
-; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX10-NEXT:    s_and_b32 s1, s4, 1
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, s4, 3
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_lshr_b32 s2, s4, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX11-NEXT:    s_and_b32 s1, s4, 1
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX11-NEXT:    s_and_b32 s0, s4, 3
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i16> %vector, i32 %idx
@@ -51,71 +83,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg
 define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX9-NEXT:    s_and_b32 s1, s2, 1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX9-NEXT:    s_and_b32 s0, s2, 3
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX8-NEXT:    s_and_b32 s1, s2, 1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX8-NEXT:    s_and_b32 s0, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    s_and_b32 s0, s2, 1
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX10-NEXT:    s_and_b32 s0, s2, 3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT:    s_and_b32 s0, s2, 1
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX11-NEXT:    s_and_b32 s0, s2, 3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -127,70 +159,66 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 %idx
@@ -198,48 +226,74 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
@@ -248,22 +302,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i16> %vector, i32 0
@@ -271,25 +351,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 2
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i16> %vector, i32 1
@@ -297,25 +402,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 4
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i16> %vector, i32 2
@@ -323,25 +453,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s1, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 6
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i16_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i16_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i16> %vector, i32 3
@@ -352,14 +507,14 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -369,21 +524,21 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -395,17 +550,17 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
@@ -414,25 +569,22 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 1
@@ -443,17 +595,17 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
@@ -462,25 +614,22 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 2
@@ -491,17 +640,17 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
@@ -510,25 +659,22 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:6
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 3
@@ -536,54 +682,74 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_lshr_b32 s5, s4, 1
-; GCN-NEXT:    s_cmp_eq_u32 s5, 1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_cmp_eq_u32 s5, 2
-; GCN-NEXT:    s_cselect_b32 s0, s2, s0
-; GCN-NEXT:    s_cmp_eq_u32 s5, 3
-; GCN-NEXT:    s_cselect_b32 s0, s3, s0
-; GCN-NEXT:    s_and_b32 s1, s4, 1
-; GCN-NEXT:    s_lshl_b32 s1, s1, 4
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s4, 7
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_u32 s0, s2, s0
+; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s4, 7
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_add_u32 s0, s2, s0
+; GFX8-NEXT:    s_addc_u32 s1, s3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s4, 7
+; GFX7-NEXT:    s_lshl_b32 s4, s2, 1
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_lshr_b32 s5, s4, 1
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX10-NEXT:    s_cselect_b32 s0, s3, s0
-; GFX10-NEXT:    s_and_b32 s1, s4, 1
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, s4, 7
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_lshr_b32 s5, s4, 1
+; GFX11-NEXT:    s_and_b32 s0, s4, 7
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX11-NEXT:    s_cselect_b32 s0, s3, s0
-; GFX11-NEXT:    s_and_b32 s1, s4, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 %idx
@@ -593,92 +759,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg
 define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_and_b32 s0, s2, 7
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    s_lshl_b32 s0, s1, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_and_b32 s0, s2, 7
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    s_lshl_b32 s0, s1, 4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_and_b32 s0, s2, 7
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT:    s_lshl_b32 s0, s1, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT:    s_and_b32 s0, s2, 7
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT:    s_and_b32 s0, s2, 1
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT:    s_and_b32 s0, s2, 7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT:    s_and_b32 s0, s2, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -690,91 +835,66 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 %idx
@@ -782,64 +902,74 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
@@ -848,22 +978,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 0
@@ -871,25 +1027,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 2
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 1
@@ -897,25 +1078,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 4
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 2
@@ -923,25 +1129,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s1, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 6
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 3
@@ -949,25 +1180,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 8
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 4
@@ -975,25 +1231,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx5:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s2, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 10
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:10
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:10
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 5
@@ -1001,25 +1282,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 12
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:12
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:12
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 6
@@ -1027,25 +1333,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr
 }
 
 define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s3, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3] offset:14
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 14
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i16_idx7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3] offset:14
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i16_idx7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:14
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i16>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i16> %vector, i32 7
@@ -1056,14 +1387,14 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1073,21 +1404,21 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -1099,17 +1430,17 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx1:
@@ -1118,25 +1449,22 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 1
@@ -1147,17 +1475,17 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx2:
@@ -1166,25 +1494,22 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 2
@@ -1195,17 +1520,17 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx3:
@@ -1214,25 +1539,22 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:6
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 3
@@ -1243,17 +1565,17 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx4:
@@ -1262,25 +1584,22 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 4
@@ -1291,17 +1610,17 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx5:
@@ -1310,25 +1629,22 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:10
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 5
@@ -1339,17 +1655,17 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx6:
@@ -1358,25 +1674,22 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 6
@@ -1387,17 +1700,17 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:14
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx7:
@@ -1406,27 +1719,26 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off offset:14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 7
   ret i16 %element
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c7f49d526fac0..c2394ec461490 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -6,32 +6,68 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
 
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT:    s_and_b32 s1, s4, 3
-; GCN-NEXT:    s_lshl_b32 s1, s1, 3
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s4, 3
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_u32 s0, s2, s0
+; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s4, 3
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_add_u32 s0, s2, s0
+; GFX8-NEXT:    s_addc_u32 s1, s3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s4, s4, 3
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_and_b32 s1, s4, 3
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, s4, 3
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    s_and_b32 s1, s4, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, s4, 3
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i8> %vector, i32 %idx
@@ -41,55 +77,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p
 define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_and_b32 s0, s2, 3
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, s2, 3
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s0, s2, 3
-; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_and_b32 s0, s2, 3
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_and_b32 s0, s2, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -101,55 +147,60 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 %idx
@@ -159,53 +210,67 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshr_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
+; GFX10-NEXT:    v_and_b32_e32 v2, 3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
@@ -214,22 +279,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i8> %vector, i32 0
@@ -237,25 +328,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 8
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 1
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i8> %vector, i32 1
@@ -263,25 +379,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 2
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i8> %vector, i32 2
@@ -289,25 +430,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 24
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 3
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i8_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v4i8_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <4 x i8> %vector, i32 3
@@ -318,14 +484,14 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -335,21 +501,21 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -361,17 +527,17 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx1:
@@ -380,25 +546,22 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 1
@@ -409,17 +572,17 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx2:
@@ -428,25 +591,22 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 2
@@ -457,17 +617,17 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx3:
@@ -476,25 +636,22 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 3
@@ -502,42 +659,68 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_lshr_b32 s2, s4, 2
-; GCN-NEXT:    s_cmp_eq_u32 s2, 1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_and_b32 s1, s4, 3
-; GCN-NEXT:    s_lshl_b32 s1, s1, 3
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s4, 7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_u32 s0, s2, s0
+; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s4, 7
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_add_u32 s0, s2, s0
+; GFX8-NEXT:    s_addc_u32 s1, s3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s4, s4, 7
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_lshr_b32 s2, s4, 2
-; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX10-NEXT:    s_and_b32 s1, s4, 3
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, s4, 7
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_lshr_b32 s2, s4, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX11-NEXT:    s_and_b32 s1, s4, 3
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, s4, 7
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 %idx
@@ -547,71 +730,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p
 define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX9-NEXT:    s_and_b32 s1, s2, 3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX9-NEXT:    s_and_b32 s0, s2, 7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX8-NEXT:    s_and_b32 s1, s2, 3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX8-NEXT:    s_and_b32 s0, s2, 7
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX7-NEXT:    s_and_b32 s1, s2, 3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX7-NEXT:    s_and_b32 s0, s2, 7
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    s_and_b32 s0, s2, 3
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX10-NEXT:    s_and_b32 s0, s2, 7
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT:    s_and_b32 s0, s2, 3
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX11-NEXT:    s_and_b32 s0, s2, 7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -623,70 +800,60 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 %idx
@@ -694,48 +861,69 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
@@ -744,22 +932,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 0
@@ -767,25 +981,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 8
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 1
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 1
@@ -793,25 +1032,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 2
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 2
@@ -819,25 +1083,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s0, 24
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 3
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 3
@@ -845,25 +1134,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 4
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 4
@@ -871,25 +1185,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx5:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s1, 8
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 5
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:5
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 5
@@ -897,25 +1236,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s1, 16
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 6
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:6
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 6
@@ -923,25 +1287,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr)
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, s1, 24
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_u32 s0, s2, 7
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v8i8_idx7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v8i8_idx7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:7
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <8 x i8> %vector, i32 7
@@ -952,14 +1341,14 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -969,21 +1358,21 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -995,17 +1384,17 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx1:
@@ -1014,25 +1403,22 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 1
@@ -1043,17 +1429,17 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx2:
@@ -1062,25 +1448,22 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 2
@@ -1091,17 +1474,17 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx3:
@@ -1110,25 +1493,22 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 3
@@ -1139,17 +1519,17 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx4:
@@ -1158,25 +1538,22 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 4
@@ -1187,17 +1564,17 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx5:
@@ -1206,25 +1583,22 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 5
@@ -1235,17 +1609,17 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx6:
@@ -1254,25 +1628,22 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:6
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 6
@@ -1283,17 +1654,17 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx7:
@@ -1302,25 +1673,22 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:7
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 7
@@ -1328,54 +1696,68 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    s_lshr_b32 s5, s4, 2
-; GCN-NEXT:    s_cmp_eq_u32 s5, 1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_cmp_eq_u32 s5, 2
-; GCN-NEXT:    s_cselect_b32 s0, s2, s0
-; GCN-NEXT:    s_cmp_eq_u32 s5, 3
-; GCN-NEXT:    s_cselect_b32 s0, s3, s0
-; GCN-NEXT:    s_and_b32 s1, s4, 3
-; GCN-NEXT:    s_lshl_b32 s1, s1, 3
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s4, 15
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_u32 s0, s2, s0
+; GFX9-NEXT:    s_addc_u32 s1, s3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s4, 15
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    s_add_u32 s0, s2, s0
+; GFX8-NEXT:    s_addc_u32 s1, s3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    s_lshr_b32 s5, s4, 2
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX10-NEXT:    s_cselect_b32 s0, s3, s0
-; GFX10-NEXT:    s_and_b32 s1, s4, 3
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, s4, 15
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    s_add_u32 s0, s2, s0
+; GFX10-NEXT:    s_addc_u32 s1, s3, s1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    s_lshr_b32 s5, s4, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX11-NEXT:    s_cselect_b32 s0, s3, s0
-; GFX11-NEXT:    s_and_b32 s1, s4, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, s4, 15
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    s_add_u32 s0, s2, s0
+; GFX11-NEXT:    s_addc_u32 s1, s3, s1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <16 x i8>, ptr addrspace(4) %ptr
   %element = extractelement <16 x i8> %vector, i32 %idx
@@ -1385,92 +1767,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %
 define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT:    s_and_b32 s1, s2, 3
+; GFX9-NEXT:    s_and_b32 s0, s2, 15
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    s_lshl_b32 s0, s1, 3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_and_b32 s0, s2, 15
+; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    s_and_b32 s0, s2, 15
+; GFX7-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT:    s_lshl_b32 s0, s1, 3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT:    s_and_b32 s0, s2, 3
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT:    s_and_b32 s0, s2, 15
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT:    s_and_b32 s0, s2, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1482,91 +1837,60 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 %idx
@@ -1574,64 +1898,69 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 }
 
 define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %vector = load <16 x i8>, ptr addrspace(4) %ptr
@@ -1643,14 +1972,14 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1660,21 +1989,21 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1686,17 +2015,17 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx1:
@@ -1705,25 +2034,22 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 1
@@ -1734,17 +2060,17 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx2:
@@ -1753,25 +2079,22 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 2
@@ -1782,17 +2105,17 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx3:
@@ -1801,25 +2124,22 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 3
@@ -1830,17 +2150,17 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx4:
@@ -1849,25 +2169,22 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 4
@@ -1878,17 +2195,17 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx5:
@@ -1897,25 +2214,22 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 5
@@ -1926,17 +2240,17 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx6:
@@ -1945,25 +2259,22 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:6
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 6
@@ -1974,17 +2285,17 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx7:
@@ -1993,25 +2304,22 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:7
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 7
@@ -2022,17 +2330,17 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx8:
@@ -2041,25 +2349,22 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 8
@@ -2070,17 +2375,17 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:9
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx9:
@@ -2089,25 +2394,22 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:9
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:9
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:9
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 9
@@ -2118,17 +2420,17 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx10:
@@ -2137,25 +2439,22 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:10
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:10
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 10
@@ -2166,17 +2465,17 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:11
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 11, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx11:
@@ -2185,25 +2484,22 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:11
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:11
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 11
@@ -2214,17 +2510,17 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx12:
@@ -2233,25 +2529,22 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 12
@@ -2262,17 +2555,17 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:13
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 13, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx13:
@@ -2281,25 +2574,22 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:13
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:13
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:13
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 13
@@ -2310,17 +2600,17 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:14
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx14:
@@ -2329,25 +2619,22 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 14
@@ -2358,17 +2645,17 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:15
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx15:
@@ -2377,27 +2664,26 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:15
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:15
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:15
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 15
   ret i8 %element
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8bf34caea4051..d4c536bdd5ebe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -4738,35 +4738,31 @@ define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_7:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT:    global_load_dword v0, v[0:1], off offset:28
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_7:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 28, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT:    flat_load_dword v0, v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:28
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_7:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:28
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 7
@@ -4777,7 +4773,7 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_32:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT:    global_load_dword v0, v[0:1], off offset:128
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4786,21 +4782,21 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT:    flat_load_dword v0, v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:128
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -4812,35 +4808,31 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_33:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT:    global_load_dword v0, v[0:1], off offset:132
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_33:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x84, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT:    flat_load_dword v0, v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_33:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:132
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_33:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:132
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 33
@@ -4851,35 +4843,31 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_37:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GPRIDX-NEXT:    global_load_dword v0, v[0:1], off offset:148
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v5
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_37:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x90, v0
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x94, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT:    flat_load_dword v0, v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v5
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_37:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:148
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_37:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:144
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:148
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 37


        


More information about the llvm-commits mailing list