[llvm] e002015 - GlobalISel: Implement fewerElementsVector for G_EXTRACT_VECTOR_ELT

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 6 11:33:26 PDT 2020


Author: Matt Arsenault
Date: 2020-08-06T14:33:16-04:00
New Revision: e00201539f9c5c540a037add790eb31032aaea8f

URL: https://github.com/llvm/llvm-project/commit/e00201539f9c5c540a037add790eb31032aaea8f
DIFF: https://github.com/llvm/llvm-project/commit/e00201539f9c5c540a037add790eb31032aaea8f.diff

LOG: GlobalISel: Implement fewerElementsVector for G_EXTRACT_VECTOR_ELT

Use the same basic strategy as LegalizeVectorTypes. Try to index into
smaller pieces if there's a constant index, and otherwise fall back to
a stack temporary.

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 9ca7bf67e6d5..621c4cefe09d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -279,6 +279,9 @@ class LegalizerHelper {
   LegalizeResult fewerElementsVectorBuildVector(MachineInstr &MI,
                                                 unsigned TypeIdx,
                                                 LLT NarrowTy);
+  LegalizeResult fewerElementsVectorExtractVectorElt(MachineInstr &MI,
+                                                     unsigned TypeIdx,
+                                                     LLT NarrowTy);
 
   LegalizeResult
   reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy);

diff  --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a7d11d9c6c8f..b56d1a0b3f59 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3472,6 +3472,59 @@ LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI,
   return Legalized;
 }
 
+/// Split the source vector of a G_EXTRACT_VECTOR_ELT into NarrowVecTy pieces
+/// when the index is constant; otherwise defer to lowerExtractVectorElt.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI,
+                                                     unsigned TypeIdx,
+                                                     LLT NarrowVecTy) {
+  assert(TypeIdx == 1 && "not a vector type index");
+
+  // TODO: Handle total scalarization case.
+  if (!NarrowVecTy.isVector())
+    return UnableToLegalize;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+  LLT VecTy = MRI.getType(SrcVec);
+
+  // If the index is a constant, we can really break this down as you would
+  // expect, and index into the target size pieces.
+  int64_t IdxVal;
+  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
+    // Avoid out of bounds indexing the pieces. A negative constant cannot name
+    // an element either and could otherwise yield a negative VecParts index.
+    if (IdxVal < 0 || IdxVal >= VecTy.getNumElements()) {
+      MIRBuilder.buildUndef(DstReg);
+      MI.eraseFromParent();
+      return Legalized;
+    }
+
+    SmallVector<Register, 8> VecParts;
+    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
+
+    // Build a sequence of NarrowVecTy pieces in VecParts for this operand.
+    buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
+                        TargetOpcode::G_ANYEXT);
+
+    // Pick the piece that holds the element and rebase the index into it.
+    unsigned NewNumElts = NarrowVecTy.getNumElements();
+    LLT IdxTy = MRI.getType(Idx);
+    int64_t PartIdx = IdxVal / NewNumElts;
+    auto NewIdx =
+        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
+    MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  // With a variable index, we can't perform the extract in a smaller type, so
+  // we're forced to expand this.
+  // TODO: Emit a compare/select chain to pick the piece to index instead.
+  return lowerExtractVectorElt(MI);
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
                                       LLT NarrowTy) {
@@ -3801,6 +3854,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
   case G_BUILD_VECTOR:
     return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy);
+  case G_EXTRACT_VECTOR_ELT:
+    return fewerElementsVectorExtractVectorElt(MI, TypeIdx, NarrowTy);
   case G_LOAD:
   case G_STORE:
     return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 237d4595e10f..f84f58a6b5de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1340,7 +1340,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .clampScalar(EltTypeIdx, S32, S64)
       .clampScalar(VecTypeIdx, S32, S64)
       .clampScalar(IdxTypeIdx, S32, S32)
-      // TODO: Clamp the number of elements before resorting to stack lowering.
+      .clampMaxNumElements(1, S32, 32)
+      // TODO: Clamp elements for 64-bit vectors?
       // It should only be necessary with variable indexes.
       // As a last resort, lower to the stack
       .lower();

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
new file mode 100644
index 000000000000..d2d9bea66089
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -0,0 +1,861 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; Check lowering of some large extractelement operations that use the stack
+; instead of register indexing.
+
+define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
+; GCN-LABEL: v_extract_v64i32_varidx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v15, v0
+; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
+; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
+; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
+; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
+; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
+; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
+; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
+; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
+; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
+; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
+; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
+; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
+; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
+; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
+; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
+; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
+; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
+; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
+; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
+; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
+; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
+; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
+; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
+; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
+; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
+; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v15
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v9, v16
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v10, v17
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v18
+; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
+; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
+; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
+; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
+; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
+; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xd8, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xdc, v0
+; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v53, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v54, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xe8, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xf0, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 0xf4, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v9, 0xfc, v0
+; GCN-NEXT:    buffer_store_dword v55, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v56, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v57, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v9, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v1, 63, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
+  %elt = extractelement <64 x i32> %vec, i32 %idx
+  ret i32 %elt
+}
+
+define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
+; GCN-LABEL: v_extract_v128i16_varidx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v15, v0
+; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
+; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
+; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
+; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
+; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
+; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
+; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
+; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
+; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
+; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
+; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
+; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
+; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
+; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
+; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
+; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
+; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
+; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
+; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
+; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
+; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
+; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
+; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
+; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
+; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
+; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v15
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v9, v16
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v10, v17
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v18
+; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
+; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
+; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
+; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
+; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
+; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 63, v1
+; GCN-NEXT:    v_add_u32_e32 v3, 0xd0, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xd4, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xd8, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xdc, v0
+; GCN-NEXT:    buffer_store_dword v51, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v52, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v53, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v54, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v3, 0xe0, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GCN-NEXT:    v_add_u32_e32 v4, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xe8, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 0xf0, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xf4, v0
+; GCN-NEXT:    v_add_u32_e32 v9, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v10, 0xfc, v0
+; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v55, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v56, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v57, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v10, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    s_waitcnt vmcnt(15)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <128 x i16>, <128 x i16> addrspace(1)* %ptr
+  %elt = extractelement <128 x i16> %vec, i32 %idx
+  ret i16 %elt
+}
+
+define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
+; GCN-LABEL: v_extract_v32i64_varidx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v15, v0
+; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
+; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
+; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
+; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
+; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
+; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
+; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
+; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
+; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
+; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
+; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
+; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
+; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
+; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
+; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
+; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
+; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
+; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
+; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
+; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
+; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
+; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
+; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
+; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
+; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
+; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v15
+; GCN-NEXT:    v_mov_b32_e32 v9, v16
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v18
+; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v10, v17
+; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
+; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
+; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
+; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
+; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
+; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xd8, v0
+; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v53, v4, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xe0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xdc, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xe8, v0
+; GCN-NEXT:    v_add_u32_e32 v10, 0xf0, v0
+; GCN-NEXT:    buffer_store_dword v55, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v57, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v10, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v4, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v9, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xfc, v0
+; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v54, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v56, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v4, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v1, 31, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
+; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr
+  %elt = extractelement <32 x i64> %vec, i32 %idx
+  ret i64 %elt
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 4b78c605e0b7..6274f055fa27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2495,3 +2495,125 @@ entry:
   store double %ext, double addrspace(1)* %out
   ret void
 }
+
+define i32 @v_extract_v64i32_7(<64 x i32> addrspace(1)* %ptr) {
+; GPRIDX-LABEL: v_extract_v64i32_7:
+; GPRIDX:       ; %bb.0:
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: v_extract_v64i32_7:
+; MOVREL:       ; %bb.0:
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
+  %elt = extractelement <64 x i32> %vec, i32 7
+  ret i32 %elt
+}
+
+define i32 @v_extract_v64i32_32(<64 x i32> addrspace(1)* %ptr) {
+; GPRIDX-LABEL: v_extract_v64i32_32:
+; GPRIDX:       ; %bb.0:
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_movk_i32 s4, 0x80
+; GPRIDX-NEXT:    s_mov_b32 s5, 0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
+; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: v_extract_v64i32_32:
+; MOVREL:       ; %bb.0:
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_movk_i32 s4, 0x80
+; MOVREL-NEXT:    s_mov_b32 s5, 0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
+  %elt = extractelement <64 x i32> %vec, i32 32
+  ret i32 %elt
+}
+
+define i32 @v_extract_v64i32_33(<64 x i32> addrspace(1)* %ptr) {
+; GPRIDX-LABEL: v_extract_v64i32_33:
+; GPRIDX:       ; %bb.0:
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_movk_i32 s4, 0x80
+; GPRIDX-NEXT:    s_mov_b32 s5, 0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
+; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: v_extract_v64i32_33:
+; MOVREL:       ; %bb.0:
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_movk_i32 s4, 0x80
+; MOVREL-NEXT:    s_mov_b32 s5, 0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
+  %elt = extractelement <64 x i32> %vec, i32 33
+  ret i32 %elt
+}
+
+define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
+; GPRIDX-LABEL: v_extract_v64i32_37:
+; GPRIDX:       ; %bb.0:
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_movk_i32 s4, 0x80
+; GPRIDX-NEXT:    s_mov_b32 s5, 0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
+; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v5
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: v_extract_v64i32_37:
+; MOVREL:       ; %bb.0:
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_movk_i32 s4, 0x80
+; MOVREL-NEXT:    s_mov_b32 s5, 0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v5
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
+  %elt = extractelement <64 x i32> %vec, i32 37
+  ret i32 %elt
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
index b548ff550343..f3c82289c239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
@@ -1408,208 +1408,286 @@ body: |
     ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
     ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
     ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](<16 x s32>), 224
+    ; CHECK: S_ENDPGM 0, implicit [[EXTRACT]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_CONSTANT i32 7
+    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: extract_vector_elt_33_v64s32
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_33_v64s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](<16 x s32>), 32
+    ; CHECK: S_ENDPGM 0, implicit [[EXTRACT]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_CONSTANT i32 33
+    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    S_ENDPGM 0, implicit %3
+...
+
+# Test handling of out of bounds indexes
+---
+name: extract_vector_elt_64_65_v64s32
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_64_65_v64s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+    ; CHECK: S_ENDPGM 0, implicit [[COPY1]](s32), implicit [[DEF]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_CONSTANT i32 64
+    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_CONSTANT i32 65
+    %5:_(s32) = G_EXTRACT_VECTOR_ELT %2, %4
+    S_ENDPGM 0, implicit %3, implicit %5
+...
+
+---
+name: extract_vector_elt_33_v64p3
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_33_v64p3
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x p3>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
     ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
-    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>)
-    ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
-    ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>)
-    ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>)
-    ; CHECK: G_STORE [[UV]](s32), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5)
+    ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3), [[UV2:%[0-9]+]]:_(p3), [[UV3:%[0-9]+]]:_(p3), [[UV4:%[0-9]+]]:_(p3), [[UV5:%[0-9]+]]:_(p3), [[UV6:%[0-9]+]]:_(p3), [[UV7:%[0-9]+]]:_(p3), [[UV8:%[0-9]+]]:_(p3), [[UV9:%[0-9]+]]:_(p3), [[UV10:%[0-9]+]]:_(p3), [[UV11:%[0-9]+]]:_(p3), [[UV12:%[0-9]+]]:_(p3), [[UV13:%[0-9]+]]:_(p3), [[UV14:%[0-9]+]]:_(p3), [[UV15:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD]](<16 x p3>)
+    ; CHECK: [[UV16:%[0-9]+]]:_(p3), [[UV17:%[0-9]+]]:_(p3), [[UV18:%[0-9]+]]:_(p3), [[UV19:%[0-9]+]]:_(p3), [[UV20:%[0-9]+]]:_(p3), [[UV21:%[0-9]+]]:_(p3), [[UV22:%[0-9]+]]:_(p3), [[UV23:%[0-9]+]]:_(p3), [[UV24:%[0-9]+]]:_(p3), [[UV25:%[0-9]+]]:_(p3), [[UV26:%[0-9]+]]:_(p3), [[UV27:%[0-9]+]]:_(p3), [[UV28:%[0-9]+]]:_(p3), [[UV29:%[0-9]+]]:_(p3), [[UV30:%[0-9]+]]:_(p3), [[UV31:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD1]](<16 x p3>)
+    ; CHECK: [[UV32:%[0-9]+]]:_(p3), [[UV33:%[0-9]+]]:_(p3), [[UV34:%[0-9]+]]:_(p3), [[UV35:%[0-9]+]]:_(p3), [[UV36:%[0-9]+]]:_(p3), [[UV37:%[0-9]+]]:_(p3), [[UV38:%[0-9]+]]:_(p3), [[UV39:%[0-9]+]]:_(p3), [[UV40:%[0-9]+]]:_(p3), [[UV41:%[0-9]+]]:_(p3), [[UV42:%[0-9]+]]:_(p3), [[UV43:%[0-9]+]]:_(p3), [[UV44:%[0-9]+]]:_(p3), [[UV45:%[0-9]+]]:_(p3), [[UV46:%[0-9]+]]:_(p3), [[UV47:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD2]](<16 x p3>)
+    ; CHECK: [[UV48:%[0-9]+]]:_(p3), [[UV49:%[0-9]+]]:_(p3), [[UV50:%[0-9]+]]:_(p3), [[UV51:%[0-9]+]]:_(p3), [[UV52:%[0-9]+]]:_(p3), [[UV53:%[0-9]+]]:_(p3), [[UV54:%[0-9]+]]:_(p3), [[UV55:%[0-9]+]]:_(p3), [[UV56:%[0-9]+]]:_(p3), [[UV57:%[0-9]+]]:_(p3), [[UV58:%[0-9]+]]:_(p3), [[UV59:%[0-9]+]]:_(p3), [[UV60:%[0-9]+]]:_(p3), [[UV61:%[0-9]+]]:_(p3), [[UV62:%[0-9]+]]:_(p3), [[UV63:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[LOAD3]](<16 x p3>)
+    ; CHECK: G_STORE [[UV]](p3), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5)
     ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C3]](s32)
-    ; CHECK: G_STORE [[UV1]](s32), [[PTR_ADD3]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV1]](p3), [[PTR_ADD3]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
     ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s32)
-    ; CHECK: G_STORE [[UV2]](s32), [[PTR_ADD4]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV2]](p3), [[PTR_ADD4]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
     ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C5]](s32)
-    ; CHECK: G_STORE [[UV3]](s32), [[PTR_ADD5]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV3]](p3), [[PTR_ADD5]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
     ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s32)
-    ; CHECK: G_STORE [[UV4]](s32), [[PTR_ADD6]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV4]](p3), [[PTR_ADD6]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
     ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
     ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C7]](s32)
-    ; CHECK: G_STORE [[UV5]](s32), [[PTR_ADD7]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV5]](p3), [[PTR_ADD7]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
     ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
     ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s32)
-    ; CHECK: G_STORE [[UV6]](s32), [[PTR_ADD8]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV6]](p3), [[PTR_ADD8]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
     ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
     ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C9]](s32)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(p5) = COPY [[PTR_ADD9]](p5)
-    ; CHECK: G_STORE [[UV7]](s32), [[COPY1]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV7]](p3), [[PTR_ADD9]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
     ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
     ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s32)
-    ; CHECK: G_STORE [[UV8]](s32), [[PTR_ADD10]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV8]](p3), [[PTR_ADD10]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
     ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 36
     ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C11]](s32)
-    ; CHECK: G_STORE [[UV9]](s32), [[PTR_ADD11]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV9]](p3), [[PTR_ADD11]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
     ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 40
     ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C12]](s32)
-    ; CHECK: G_STORE [[UV10]](s32), [[PTR_ADD12]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV10]](p3), [[PTR_ADD12]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
     ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 44
     ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C13]](s32)
-    ; CHECK: G_STORE [[UV11]](s32), [[PTR_ADD13]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV11]](p3), [[PTR_ADD13]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
     ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 48
     ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C14]](s32)
-    ; CHECK: G_STORE [[UV12]](s32), [[PTR_ADD14]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV12]](p3), [[PTR_ADD14]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
     ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 52
     ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C15]](s32)
-    ; CHECK: G_STORE [[UV13]](s32), [[PTR_ADD15]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV13]](p3), [[PTR_ADD15]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
     ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 56
     ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C16]](s32)
-    ; CHECK: G_STORE [[UV14]](s32), [[PTR_ADD16]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV14]](p3), [[PTR_ADD16]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
     ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 60
     ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C17]](s32)
-    ; CHECK: G_STORE [[UV15]](s32), [[PTR_ADD17]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV15]](p3), [[PTR_ADD17]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
     ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
     ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C18]](s32)
-    ; CHECK: G_STORE [[UV16]](s32), [[PTR_ADD18]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV16]](p3), [[PTR_ADD18]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
     ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 68
     ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C19]](s32)
-    ; CHECK: G_STORE [[UV17]](s32), [[PTR_ADD19]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV17]](p3), [[PTR_ADD19]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
     ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 72
     ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C20]](s32)
-    ; CHECK: G_STORE [[UV18]](s32), [[PTR_ADD20]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV18]](p3), [[PTR_ADD20]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
     ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 76
     ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C21]](s32)
-    ; CHECK: G_STORE [[UV19]](s32), [[PTR_ADD21]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV19]](p3), [[PTR_ADD21]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
     ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 80
     ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C22]](s32)
-    ; CHECK: G_STORE [[UV20]](s32), [[PTR_ADD22]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV20]](p3), [[PTR_ADD22]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
     ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 84
     ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C23]](s32)
-    ; CHECK: G_STORE [[UV21]](s32), [[PTR_ADD23]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV21]](p3), [[PTR_ADD23]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
     ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 88
     ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C24]](s32)
-    ; CHECK: G_STORE [[UV22]](s32), [[PTR_ADD24]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV22]](p3), [[PTR_ADD24]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
     ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 92
     ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C25]](s32)
-    ; CHECK: G_STORE [[UV23]](s32), [[PTR_ADD25]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV23]](p3), [[PTR_ADD25]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
     ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 96
     ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C26]](s32)
-    ; CHECK: G_STORE [[UV24]](s32), [[PTR_ADD26]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV24]](p3), [[PTR_ADD26]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
     ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
     ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C27]](s32)
-    ; CHECK: G_STORE [[UV25]](s32), [[PTR_ADD27]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV25]](p3), [[PTR_ADD27]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
     ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
     ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C28]](s32)
-    ; CHECK: G_STORE [[UV26]](s32), [[PTR_ADD28]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV26]](p3), [[PTR_ADD28]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
     ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 108
     ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C29]](s32)
-    ; CHECK: G_STORE [[UV27]](s32), [[PTR_ADD29]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV27]](p3), [[PTR_ADD29]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
     ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 112
     ; CHECK: [[PTR_ADD30:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C30]](s32)
-    ; CHECK: G_STORE [[UV28]](s32), [[PTR_ADD30]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV28]](p3), [[PTR_ADD30]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
     ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 116
     ; CHECK: [[PTR_ADD31:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C31]](s32)
-    ; CHECK: G_STORE [[UV29]](s32), [[PTR_ADD31]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV29]](p3), [[PTR_ADD31]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
     ; CHECK: [[C32:%[0-9]+]]:_(s32) = G_CONSTANT i32 120
     ; CHECK: [[PTR_ADD32:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C32]](s32)
-    ; CHECK: G_STORE [[UV30]](s32), [[PTR_ADD32]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV30]](p3), [[PTR_ADD32]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
     ; CHECK: [[C33:%[0-9]+]]:_(s32) = G_CONSTANT i32 124
     ; CHECK: [[PTR_ADD33:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C33]](s32)
-    ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD33]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV31]](p3), [[PTR_ADD33]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
     ; CHECK: [[C34:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
     ; CHECK: [[PTR_ADD34:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C34]](s32)
-    ; CHECK: G_STORE [[UV32]](s32), [[PTR_ADD34]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV32]](p3), [[PTR_ADD34]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
     ; CHECK: [[C35:%[0-9]+]]:_(s32) = G_CONSTANT i32 132
     ; CHECK: [[PTR_ADD35:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C35]](s32)
-    ; CHECK: G_STORE [[UV33]](s32), [[PTR_ADD35]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p5) = COPY [[PTR_ADD35]](p5)
+    ; CHECK: G_STORE [[UV33]](p3), [[COPY1]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
     ; CHECK: [[C36:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
     ; CHECK: [[PTR_ADD36:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C36]](s32)
-    ; CHECK: G_STORE [[UV34]](s32), [[PTR_ADD36]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV34]](p3), [[PTR_ADD36]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
     ; CHECK: [[C37:%[0-9]+]]:_(s32) = G_CONSTANT i32 140
     ; CHECK: [[PTR_ADD37:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C37]](s32)
-    ; CHECK: G_STORE [[UV35]](s32), [[PTR_ADD37]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV35]](p3), [[PTR_ADD37]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
     ; CHECK: [[C38:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
     ; CHECK: [[PTR_ADD38:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C38]](s32)
-    ; CHECK: G_STORE [[UV36]](s32), [[PTR_ADD38]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV36]](p3), [[PTR_ADD38]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
     ; CHECK: [[C39:%[0-9]+]]:_(s32) = G_CONSTANT i32 148
     ; CHECK: [[PTR_ADD39:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C39]](s32)
-    ; CHECK: G_STORE [[UV37]](s32), [[PTR_ADD39]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV37]](p3), [[PTR_ADD39]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
     ; CHECK: [[C40:%[0-9]+]]:_(s32) = G_CONSTANT i32 152
     ; CHECK: [[PTR_ADD40:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C40]](s32)
-    ; CHECK: G_STORE [[UV38]](s32), [[PTR_ADD40]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV38]](p3), [[PTR_ADD40]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
     ; CHECK: [[C41:%[0-9]+]]:_(s32) = G_CONSTANT i32 156
     ; CHECK: [[PTR_ADD41:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C41]](s32)
-    ; CHECK: G_STORE [[UV39]](s32), [[PTR_ADD41]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV39]](p3), [[PTR_ADD41]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
     ; CHECK: [[C42:%[0-9]+]]:_(s32) = G_CONSTANT i32 160
     ; CHECK: [[PTR_ADD42:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C42]](s32)
-    ; CHECK: G_STORE [[UV40]](s32), [[PTR_ADD42]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV40]](p3), [[PTR_ADD42]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
     ; CHECK: [[C43:%[0-9]+]]:_(s32) = G_CONSTANT i32 164
     ; CHECK: [[PTR_ADD43:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C43]](s32)
-    ; CHECK: G_STORE [[UV41]](s32), [[PTR_ADD43]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV41]](p3), [[PTR_ADD43]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
     ; CHECK: [[C44:%[0-9]+]]:_(s32) = G_CONSTANT i32 168
     ; CHECK: [[PTR_ADD44:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C44]](s32)
-    ; CHECK: G_STORE [[UV42]](s32), [[PTR_ADD44]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV42]](p3), [[PTR_ADD44]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
     ; CHECK: [[C45:%[0-9]+]]:_(s32) = G_CONSTANT i32 172
     ; CHECK: [[PTR_ADD45:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C45]](s32)
-    ; CHECK: G_STORE [[UV43]](s32), [[PTR_ADD45]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV43]](p3), [[PTR_ADD45]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
     ; CHECK: [[C46:%[0-9]+]]:_(s32) = G_CONSTANT i32 176
     ; CHECK: [[PTR_ADD46:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C46]](s32)
-    ; CHECK: G_STORE [[UV44]](s32), [[PTR_ADD46]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV44]](p3), [[PTR_ADD46]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
     ; CHECK: [[C47:%[0-9]+]]:_(s32) = G_CONSTANT i32 180
     ; CHECK: [[PTR_ADD47:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C47]](s32)
-    ; CHECK: G_STORE [[UV45]](s32), [[PTR_ADD47]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV45]](p3), [[PTR_ADD47]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
     ; CHECK: [[C48:%[0-9]+]]:_(s32) = G_CONSTANT i32 184
     ; CHECK: [[PTR_ADD48:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C48]](s32)
-    ; CHECK: G_STORE [[UV46]](s32), [[PTR_ADD48]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV46]](p3), [[PTR_ADD48]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
     ; CHECK: [[C49:%[0-9]+]]:_(s32) = G_CONSTANT i32 188
     ; CHECK: [[PTR_ADD49:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C49]](s32)
-    ; CHECK: G_STORE [[UV47]](s32), [[PTR_ADD49]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV47]](p3), [[PTR_ADD49]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
     ; CHECK: [[C50:%[0-9]+]]:_(s32) = G_CONSTANT i32 192
     ; CHECK: [[PTR_ADD50:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C50]](s32)
-    ; CHECK: G_STORE [[UV48]](s32), [[PTR_ADD50]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV48]](p3), [[PTR_ADD50]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
     ; CHECK: [[C51:%[0-9]+]]:_(s32) = G_CONSTANT i32 196
     ; CHECK: [[PTR_ADD51:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C51]](s32)
-    ; CHECK: G_STORE [[UV49]](s32), [[PTR_ADD51]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV49]](p3), [[PTR_ADD51]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
     ; CHECK: [[C52:%[0-9]+]]:_(s32) = G_CONSTANT i32 200
     ; CHECK: [[PTR_ADD52:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C52]](s32)
-    ; CHECK: G_STORE [[UV50]](s32), [[PTR_ADD52]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV50]](p3), [[PTR_ADD52]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
     ; CHECK: [[C53:%[0-9]+]]:_(s32) = G_CONSTANT i32 204
     ; CHECK: [[PTR_ADD53:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C53]](s32)
-    ; CHECK: G_STORE [[UV51]](s32), [[PTR_ADD53]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV51]](p3), [[PTR_ADD53]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
     ; CHECK: [[C54:%[0-9]+]]:_(s32) = G_CONSTANT i32 208
     ; CHECK: [[PTR_ADD54:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C54]](s32)
-    ; CHECK: G_STORE [[UV52]](s32), [[PTR_ADD54]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV52]](p3), [[PTR_ADD54]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
     ; CHECK: [[C55:%[0-9]+]]:_(s32) = G_CONSTANT i32 212
     ; CHECK: [[PTR_ADD55:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C55]](s32)
-    ; CHECK: G_STORE [[UV53]](s32), [[PTR_ADD55]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV53]](p3), [[PTR_ADD55]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
     ; CHECK: [[C56:%[0-9]+]]:_(s32) = G_CONSTANT i32 216
     ; CHECK: [[PTR_ADD56:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C56]](s32)
-    ; CHECK: G_STORE [[UV54]](s32), [[PTR_ADD56]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV54]](p3), [[PTR_ADD56]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
     ; CHECK: [[C57:%[0-9]+]]:_(s32) = G_CONSTANT i32 220
     ; CHECK: [[PTR_ADD57:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C57]](s32)
-    ; CHECK: G_STORE [[UV55]](s32), [[PTR_ADD57]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV55]](p3), [[PTR_ADD57]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
     ; CHECK: [[C58:%[0-9]+]]:_(s32) = G_CONSTANT i32 224
     ; CHECK: [[PTR_ADD58:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C58]](s32)
-    ; CHECK: G_STORE [[UV56]](s32), [[PTR_ADD58]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV56]](p3), [[PTR_ADD58]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
     ; CHECK: [[C59:%[0-9]+]]:_(s32) = G_CONSTANT i32 228
     ; CHECK: [[PTR_ADD59:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C59]](s32)
-    ; CHECK: G_STORE [[UV57]](s32), [[PTR_ADD59]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV57]](p3), [[PTR_ADD59]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
     ; CHECK: [[C60:%[0-9]+]]:_(s32) = G_CONSTANT i32 232
     ; CHECK: [[PTR_ADD60:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C60]](s32)
-    ; CHECK: G_STORE [[UV58]](s32), [[PTR_ADD60]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV58]](p3), [[PTR_ADD60]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
     ; CHECK: [[C61:%[0-9]+]]:_(s32) = G_CONSTANT i32 236
     ; CHECK: [[PTR_ADD61:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C61]](s32)
-    ; CHECK: G_STORE [[UV59]](s32), [[PTR_ADD61]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV59]](p3), [[PTR_ADD61]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
     ; CHECK: [[C62:%[0-9]+]]:_(s32) = G_CONSTANT i32 240
     ; CHECK: [[PTR_ADD62:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C62]](s32)
-    ; CHECK: G_STORE [[UV60]](s32), [[PTR_ADD62]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV60]](p3), [[PTR_ADD62]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
     ; CHECK: [[C63:%[0-9]+]]:_(s32) = G_CONSTANT i32 244
     ; CHECK: [[PTR_ADD63:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C63]](s32)
-    ; CHECK: G_STORE [[UV61]](s32), [[PTR_ADD63]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV61]](p3), [[PTR_ADD63]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
     ; CHECK: [[C64:%[0-9]+]]:_(s32) = G_CONSTANT i32 248
     ; CHECK: [[PTR_ADD64:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C64]](s32)
-    ; CHECK: G_STORE [[UV62]](s32), [[PTR_ADD64]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
+    ; CHECK: G_STORE [[UV62]](p3), [[PTR_ADD64]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
     ; CHECK: [[C65:%[0-9]+]]:_(s32) = G_CONSTANT i32 252
     ; CHECK: [[PTR_ADD65:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C65]](s32)
-    ; CHECK: G_STORE [[UV63]](s32), [[PTR_ADD65]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
-    ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p5) :: (load 4 from %stack.0 + 28, addrspace 5)
-    ; CHECK: S_ENDPGM 0, implicit [[LOAD4]](s32)
+    ; CHECK: G_STORE [[UV63]](p3), [[PTR_ADD65]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
+    ; CHECK: [[LOAD4:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD35]](p5) :: (load 4 from %stack.0 + 132, addrspace 5)
+    ; CHECK: S_ENDPGM 0, implicit [[LOAD4]](p3)
     %0:_(p1) = COPY $sgpr0_sgpr1
-    %1:_(s32) = G_CONSTANT i32 7
-    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
-    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    %1:_(s32) = G_CONSTANT i32 33
+    %2:_(<64 x p3>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(p3) = G_EXTRACT_VECTOR_ELT %2, %1
     S_ENDPGM 0, implicit %3
 ...
 


        


More information about the llvm-commits mailing list