[llvm] 212570a - GlobalISel: Implement bitcast action for G_EXTRACT_VECTOR_ELEMENT

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 2 07:42:13 PDT 2020


Author: Matt Arsenault
Date: 2020-08-02T10:42:07-04:00
New Revision: 212570abcf755b8577a7aec80777503232d36d77

URL: https://github.com/llvm/llvm-project/commit/212570abcf755b8577a7aec80777503232d36d77
DIFF: https://github.com/llvm/llvm-project/commit/212570abcf755b8577a7aec80777503232d36d77.diff

LOG: GlobalISel: Implement bitcast action for G_EXTRACT_VECTOR_ELEMENT

For AMDGPU, vectors with elements < 32 bits should be indexed in
32-bit elements and the desired bits extracted from there. For
elements > 64 bits, these should be reduced to 64-bit/32-bit elements
to enable the normal dynamic indexing paths.
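
As a rough illustration (not from the patch itself), the index
arithmetic for the small-element case can be modeled in standalone
C++. The helper and struct names here are hypothetical, and the
power-of-2 restriction mirrors the one in the legalizer code:

    // Model of indexing a vector of small elements (e.g. i16) through a
    // bitcast to 32-bit elements. Hypothetical sketch, not LLVM code.
    #include <cassert>

    struct SmallEltIndex {
      unsigned WideIdx;    // index into the bitcast 32-bit vector
      unsigned OffsetBits; // bit offset of the small element in that lane
    };

    static SmallEltIndex scaleIndex(unsigned Idx, unsigned OldEltSize,
                                    unsigned NewEltSize) {
      unsigned Ratio = NewEltSize / OldEltSize; // e.g. 32 / 16 == 2
      assert((Ratio & (Ratio - 1)) == 0 && "power-of-2 ratio required");
      unsigned Log2Ratio = __builtin_ctz(Ratio);
      return {Idx >> Log2Ratio,                  // G_LSHR on the index
              (Idx & (Ratio - 1)) * OldEltSize}; // G_AND + G_SHL
    }

    // e.g. element 5 of <8 x i16>: WideIdx == 2, OffsetBits == 16, so
    // 32-bit lane 2 is extracted, shifted right by 16, and truncated.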

In the dynamic index cases, this produces shorter code most of the
time. This does immediately regress the constant index cases, but
that should be fixed once we have the most basic shift combines.

The element size > 64 case is pretty much ported from the existing
DAG implementation for extract element promote. The increasing
element size case is new.
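
For the large-element direction, a hedged sketch of how the pieces
are addressed after the bitcast (again a standalone model with a
hypothetical helper name, not the LegalizerHelper code):

    // An element that spans NewEltsPerOldElt pieces after the bitcast
    // starts at piece NewEltsPerOldElt * Idx; piece I of element Idx
    // lives at NewEltsPerOldElt * Idx + I. Hypothetical sketch.
    #include <vector>

    static std::vector<unsigned> pieceIndices(unsigned Idx,
                                              unsigned NewEltsPerOldElt) {
      std::vector<unsigned> Pieces;
      unsigned Base = Idx * NewEltsPerOldElt; // G_MUL of the index
      for (unsigned I = 0; I != NewEltsPerOldElt; ++I)
        Pieces.push_back(Base + I); // one G_ADD + extract per piece
      return Pieces;
    }

    // e.g. i128 element 3 of <4 x i128> viewed as <16 x i32>: pieces
    // 12, 13, 14, 15 are extracted and merged back into an i128.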

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index d925c53a5750..e819dca5bdf0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -307,6 +307,10 @@ class LegalizerHelper {
   LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
+  /// Perform Bitcast legalize action on G_EXTRACT_VECTOR_ELT.
+  LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                         LLT CastTy);
+
   LegalizeResult lowerBitcast(MachineInstr &MI);
   LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index abb983dac6bd..920c9e008012 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2330,6 +2330,122 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
+/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
+/// is casting to a vector with a smaller element size, perform multiple element
+/// extracts and merge the results. If this is coercing to a vector with larger
+/// elements, index the bitcasted vector and extract the target element with bit
+/// operations. This is intended to force the indexing in the native register
+/// size for architectures that can dynamically index the register file.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                         LLT CastTy) {
+  if (TypeIdx != 1)
+    return UnableToLegalize;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+  LLT SrcVecTy = MRI.getType(SrcVec);
+  LLT IdxTy = MRI.getType(Idx);
+
+  LLT SrcEltTy = SrcVecTy.getElementType();
+  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
+  unsigned OldNumElts = SrcVecTy.getNumElements();
+
+  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
+  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
+
+  const unsigned NewEltSize = NewEltTy.getSizeInBits();
+  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
+  if (NewNumElts > OldNumElts) {
+    // Decreasing the vector element size
+    //
+    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
+    //  =>
+    //  v4i32:castx = bitcast x:v2i64
+    //
+    // i64 = bitcast
+    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
+    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
+    //
+    if (NewNumElts % OldNumElts != 0)
+      return UnableToLegalize;
+
+    // Type of the intermediate result vector.
+    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
+    LLT MidTy = LLT::scalarOrVector(NewEltsPerOldElt, NewEltTy);
+
+    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
+
+    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
+    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
+
+    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
+      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
+      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
+      NewOps[I] = Elt.getReg(0);
+    }
+
+    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
+    MIRBuilder.buildBitcast(Dst, NewVec);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  if (NewNumElts < OldNumElts) {
+    if (NewEltSize % OldEltSize != 0)
+      return UnableToLegalize;
+
+    // This only depends on powers of 2 because we use bit tricks to figure out
+    // the bit offset we need to shift to get the target element. A general
+    // expansion could emit division/multiply.
+    if (!isPowerOf2_32(NewEltSize / OldEltSize))
+      return UnableToLegalize;
+
+    // Increasing the vector element size.
+    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
+    //
+    //   =>
+    //
+    // %cast = G_BITCAST %vec
+    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
+    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
+    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
+    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
+    // %elt_bits = G_LSHR %wide_elt, %offset_bits
+    // %elt = G_TRUNC %elt_bits
+
+    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
+
+    // Divide to get the index in the wider element type.
+    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
+
+    Register WideElt = CastVec;
+    if (CastTy.isVector()) {
+      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
+                                                     ScaledIdx).getReg(0);
+    }
+
+    // Now figure out the amount we need to shift to get the target bits.
+    auto OffsetMask = MIRBuilder.buildConstant(
+      IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
+    auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask);
+    auto OffsetBits = MIRBuilder.buildShl(
+      IdxTy, OffsetIdx,
+      MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize)));
+
+    // Shift the wide element to get the target element.
+    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
+    MIRBuilder.buildTrunc(Dst, ExtractedBits);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {
@@ -2378,6 +2494,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
     Observer.changedInstr(MI);
     return Legalized;
   }
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
   default:
     return UnableToLegalize;
   }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c5d5f1675bc8..cc97e11707ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -70,6 +70,13 @@ static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
   };
 }
 
+static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    return Ty.getSizeInBits() % 32 == 0;
+  };
+}
+
 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT Ty = Query.Types[TypeIdx];
@@ -132,6 +139,15 @@ static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
   };
 }
 
+static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    unsigned Size = Ty.getSizeInBits();
+    assert(Size % 32 == 0);
+    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
+  };
+}
+
 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
@@ -1279,11 +1295,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
           const LLT EltTy = Query.Types[EltTypeIdx];
           const LLT VecTy = Query.Types[VecTypeIdx];
           const LLT IdxTy = Query.Types[IdxTypeIdx];
-          return (EltTy.getSizeInBits() == 16 ||
-                  EltTy.getSizeInBits() % 32 == 0) &&
-                 VecTy.getSizeInBits() % 32 == 0 &&
-                 VecTy.getSizeInBits() <= MaxRegisterSize &&
-                 IdxTy.getSizeInBits() == 32;
+          const unsigned EltSize = EltTy.getSizeInBits();
+          return (EltSize == 32 || EltSize == 64) &&
+                  VecTy.getSizeInBits() % 32 == 0 &&
+                  VecTy.getSizeInBits() <= MaxRegisterSize &&
+                  IdxTy.getSizeInBits() == 32;
+        })
+      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
+                 bitcastToVectorElement32(1))
+      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+      .bitcastIf(
+        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
+        [=](const LegalityQuery &Query) {
+          // For > 64-bit element types, try to turn this into a 64-bit
+          // element vector since we may be able to do better indexing
+          // if this is scalar. If not, fall back to 32.
+          const LLT EltTy = Query.Types[EltTypeIdx];
+          const LLT VecTy = Query.Types[VecTypeIdx];
+          const unsigned DstEltSize = EltTy.getSizeInBits();
+          const unsigned VecSize = VecTy.getSizeInBits();
+
+          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
+          return std::make_pair(
+            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
         })
       .clampScalar(EltTypeIdx, S32, S64)
       .clampScalar(VecTypeIdx, S32, S64)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
new file mode 100644
index 000000000000..28c0651b10fd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -0,0 +1,769 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX9-NEXT:    s_lshl_b32 m0, s4, 1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_movrels_b64 s[0:1], s[8:9]
+; GFX9-NEXT:    s_movrels_b64 s[2:3], s[10:11]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX8-NEXT:    s_lshl_b32 m0, s4, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_movrels_b64 s[0:1], s[8:9]
+; GFX8-NEXT:    s_movrels_b64 s[2:3], s[10:11]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX7-NEXT:    s_lshl_b32 m0, s4, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_movrels_b64 s[0:1], s[8:9]
+; GFX7-NEXT:    s_movrels_b64 s[2:3], s[10:11]
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 %idx
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v18, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v3
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v18
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[6:7]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
+; GFX8-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
+; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX8-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_movrels_b32_e32 v1, v3
+; GFX8-NEXT:    v_movrels_b32_e32 v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX7-NEXT:    s_lshl_b32 m0, s0, 1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_movrels_b32_e32 v1, v3
+; GFX7-NEXT:    v_movrels_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 %idx
+  ret i128 %element
+}
+
+define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
+; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX9-NEXT:    v_add_u32_e32 v17, 1, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v2, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v17
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[3:4]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 1, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v2, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v17
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v19, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
+; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 1, v16
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v2, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v3, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v10, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v11, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 7, v17
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v14, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 %idx
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v8, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, s8
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v10, s9
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v11, s10
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX9-NEXT:    v_mov_b32_e32 v12, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v13, s12
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v14, s13
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v14, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_mov_b32_e32 v15, s14
+; GFX9-NEXT:    v_mov_b32_e32 v16, s15
+; GFX9-NEXT:    v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v16, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, s6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v11, s8
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v12, s9
+; GFX9-NEXT:    v_mov_b32_e32 v13, s10
+; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v8, s14
+; GFX9-NEXT:    v_mov_b32_e32 v9, s15
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_mov_b32_e32 v6, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, s8
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, s9
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v11, s10
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v12, s11
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v13, s12
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v14, s13
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v14, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v15, s14
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v16, s15
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v16, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s1
+; GFX8-NEXT:    v_mov_b32_e32 v5, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v6, s3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, s6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v11, s8
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v12, s9
+; GFX8-NEXT:    v_mov_b32_e32 v13, s10
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v6, s12
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s14
+; GFX8-NEXT:    v_mov_b32_e32 v9, s15
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v4, s3
+; GFX7-NEXT:    v_mov_b32_e32 v5, s4
+; GFX7-NEXT:    v_mov_b32_e32 v6, s5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v7, s6
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v8, s7
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v9, s8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v10, s9
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v11, s10
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v12, s11
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v13, s12
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX7-NEXT:    v_mov_b32_e32 v14, s13
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v14, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v15, s14
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX7-NEXT:    v_mov_b32_e32 v16, s15
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v16, vcc
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    v_mov_b32_e32 v5, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, s3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v7, s4
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v2
+; GFX7-NEXT:    v_mov_b32_e32 v8, s5
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v9, s6
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v10, s7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v11, s8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v12, s9
+; GFX7-NEXT:    v_mov_b32_e32 v13, s10
+; GFX7-NEXT:    v_mov_b32_e32 v5, s11
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, s12
+; GFX7-NEXT:    v_mov_b32_e32 v7, s13
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX7-NEXT:    v_mov_b32_e32 v8, s14
+; GFX7-NEXT:    v_mov_b32_e32 v9, s15
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v8, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 %idx
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(<4 x i128> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 0
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(<4 x i128> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i128_idx1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_mov_b32 s2, s6
+; GCN-NEXT:    s_mov_b32 s3, s7
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 1
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(<4 x i128> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i128_idx2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s8
+; GCN-NEXT:    s_mov_b32 s1, s9
+; GCN-NEXT:    s_mov_b32 s2, s10
+; GCN-NEXT:    s_mov_b32 s3, s11
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 2
+  ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(<4 x i128> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i128_idx3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s12
+; GCN-NEXT:    s_mov_b32 s1, s13
+; GCN-NEXT:    s_mov_b32 s2, s14
+; GCN-NEXT:    s_mov_b32 s3, s15
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 3
+  ret i128 %element
+}
+
+define i128 @extractelement_vgpr_v4i128_idx0(<4 x i128> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 0
+  ret i128 %element
+}
+
+define i128 @extractelement_vgpr_v4i128_idx1(<4 x i128> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    v_mov_b32_e32 v2, v6
+; GFX8-NEXT:    v_mov_b32_e32 v3, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 1
+  ret i128 %element
+}
+
+define i128 @extractelement_vgpr_v4i128_idx2(<4 x i128> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-NEXT:    v_mov_b32_e32 v1, v9
+; GFX9-NEXT:    v_mov_b32_e32 v2, v10
+; GFX9-NEXT:    v_mov_b32_e32 v3, v11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v8
+; GFX8-NEXT:    v_mov_b32_e32 v1, v9
+; GFX8-NEXT:    v_mov_b32_e32 v2, v10
+; GFX8-NEXT:    v_mov_b32_e32 v3, v11
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    v_mov_b32_e32 v3, v11
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 2
+  ret i128 %element
+}
+
+define i128 @extractelement_vgpr_v4i128_idx3(<4 x i128> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v12
+; GFX9-NEXT:    v_mov_b32_e32 v1, v13
+; GFX9-NEXT:    v_mov_b32_e32 v2, v14
+; GFX9-NEXT:    v_mov_b32_e32 v3, v15
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i128_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v12
+; GFX8-NEXT:    v_mov_b32_e32 v1, v13
+; GFX8-NEXT:    v_mov_b32_e32 v2, v14
+; GFX8-NEXT:    v_mov_b32_e32 v3, v15
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i128_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v12
+; GFX7-NEXT:    v_mov_b32_e32 v1, v13
+; GFX7-NEXT:    v_mov_b32_e32 v2, v14
+; GFX7-NEXT:    v_mov_b32_e32 v3, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
+  %element = extractelement <4 x i128> %vector, i32 3
+  ret i128 %element
+}

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
new file mode 100644
index 000000000000..13d7fbeda0f6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -0,0 +1,802 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_lshr_b32 s2, s4, 1
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cselect_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s4, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 4
+; GCN-NEXT:    s_lshr_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 %idx) {
+; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(<4 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i16_idx0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 0
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(<4 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i16_idx1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 1
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(<4 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i16_idx2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 2
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(<4 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i16_idx3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 3
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v4i16_idx0(<4 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 0
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v4i16_idx1(<4 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 1
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v4i16_idx2(<4 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 2
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v4i16_idx3(<4 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i16_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i16_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %element = extractelement <4 x i16> %vector, i32 3
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_lshr_b32 s5, s4, 1
+; GCN-NEXT:    s_cmp_eq_u32 s5, 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cselect_b32 s0, s1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s5, 2
+; GCN-NEXT:    s_cselect_b32 s0, s2, s0
+; GCN-NEXT:    s_cmp_eq_u32 s5, 3
+; GCN-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-NEXT:    s_and_b32 s1, s4, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 4
+; GCN-NEXT:    s_lshr_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    s_lshl_b32 s0, s1, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 %idx) {
+; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 %idx
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 0
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 1
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 2
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 3
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx4:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 4
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s2, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 5
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s3
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 6
+  ret i16 %element
+}
+
+define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(<8 x i16> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i16_idx7:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s3, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 7
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx0(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 0
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx1(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 1
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx2(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 2
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx3(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 3
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx4(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 4
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx5(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 5
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx6(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx6:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 6
+  ret i16 %element
+}
+
+define i16 @extractelement_vgpr_v8i16_idx7(<8 x i16> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i16_idx7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i16_idx7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i16_idx7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %element = extractelement <8 x i16> %vector, i32 7
+  ret i16 %element
+}

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
new file mode 100644
index 000000000000..95b4177abbca
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -0,0 +1,3135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s5, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 8
+; GCN-NEXT:    s_and_b32 s1, s1, s5
+; GCN-NEXT:    s_lshr_b32 s2, s0, 16
+; GCN-NEXT:    s_lshr_b32 s3, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s5
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s2, s5
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s5
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s4, 3
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_lshr_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v2
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX7-NEXT:    s_and_b32 s1, s1, s4
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s3, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    v_lshr_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i8_idx0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s0, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s1, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s0
+; GCN-NEXT:    s_lshr_b32 s3, s1, 16
+; GCN-NEXT:    s_lshr_b32 s4, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s0
+; GCN-NEXT:    s_and_b32 s0, s4, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 0
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i8_idx1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s0, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s1, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s0
+; GCN-NEXT:    s_lshr_b32 s3, s1, 16
+; GCN-NEXT:    s_lshr_b32 s4, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s0
+; GCN-NEXT:    s_and_b32 s0, s4, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshr_b32 s0, s0, 8
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 1
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i8_idx2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s0, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s1, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s0
+; GCN-NEXT:    s_lshr_b32 s3, s1, 16
+; GCN-NEXT:    s_lshr_b32 s4, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s0
+; GCN-NEXT:    s_and_b32 s0, s4, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 2
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v4i8_idx3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s0, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s1, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s0
+; GCN-NEXT:    s_lshr_b32 s3, s1, 16
+; GCN-NEXT:    s_lshr_b32 s4, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s0
+; GCN-NEXT:    s_and_b32 s0, s4, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshr_b32 s0, s0, 24
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 3
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 0
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 1
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 2
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v4i8_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v4i8_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v4i8_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %element = extractelement <4 x i8> %vector, i32 3
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s9, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s0, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s9
+; GCN-NEXT:    s_lshr_b32 s3, s0, 16
+; GCN-NEXT:    s_lshr_b32 s5, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s5, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshr_b32 s6, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s6, s9
+; GCN-NEXT:    s_lshr_b32 s7, s1, 16
+; GCN-NEXT:    s_lshr_b32 s8, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s7, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s8, s9
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_lshr_b32 s2, s4, 2
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s4, 3
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_lshr_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 2
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
+; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
+; GFX7-NEXT:    s_and_b32 s2, s2, 3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v5
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v3
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 %idx) {
+; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s8, 0xff
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s0, 8
+; GCN-NEXT:    s_and_b32 s2, s2, s8
+; GCN-NEXT:    s_lshr_b32 s3, s0, 16
+; GCN-NEXT:    s_lshr_b32 s4, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s3, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s4, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshr_b32 s5, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s2, s5, s8
+; GCN-NEXT:    s_lshr_b32 s6, s1, 16
+; GCN-NEXT:    s_lshr_b32 s7, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s6, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s2, s7, s8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 8
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshr_b32 s2, s0, 16
+; GCN-NEXT:    s_lshr_b32 s3, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 0
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 8
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshr_b32 s2, s0, 16
+; GCN-NEXT:    s_lshr_b32 s3, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 8
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 1
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 8
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshr_b32 s2, s0, 16
+; GCN-NEXT:    s_lshr_b32 s3, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 2
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 8
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshr_b32 s2, s0, 16
+; GCN-NEXT:    s_lshr_b32 s3, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 24
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 3
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx4:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 8
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshr_b32 s2, s1, 16
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshl_b32 s0, s0, 8
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 4
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 8
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshr_b32 s2, s1, 16
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshl_b32 s0, s0, 8
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 8
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 5
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 8
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshr_b32 s2, s1, 16
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshl_b32 s0, s0, 8
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 6
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg %ptr) {
+; GCN-LABEL: extractelement_sgpr_v8i8_idx7:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s0, s1, 8
+; GCN-NEXT:    s_and_b32 s0, s0, s4
+; GCN-NEXT:    s_lshr_b32 s2, s1, 16
+; GCN-NEXT:    s_lshr_b32 s3, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s4
+; GCN-NEXT:    s_lshl_b32 s0, s0, 8
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_and_b32 s1, s2, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s3, s4
+; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_lshr_b32 s0, s0, 24
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 7
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 0
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 1
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 2
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 3
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 4
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 5
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx6:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 6
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v8i8_idx7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v8i8_idx7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v8i8_idx7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %element = extractelement <8 x i8> %vector, i32 7
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s17, 0xff
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s5, s0, 8
+; GCN-NEXT:    s_and_b32 s5, s5, s17
+; GCN-NEXT:    s_lshr_b32 s6, s0, 16
+; GCN-NEXT:    s_lshr_b32 s7, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s5
+; GCN-NEXT:    s_and_b32 s5, s6, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s5
+; GCN-NEXT:    s_and_b32 s5, s7, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshr_b32 s8, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s5
+; GCN-NEXT:    s_and_b32 s5, s8, s17
+; GCN-NEXT:    s_lshr_b32 s9, s1, 16
+; GCN-NEXT:    s_lshr_b32 s10, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s5
+; GCN-NEXT:    s_and_b32 s5, s9, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s5
+; GCN-NEXT:    s_and_b32 s5, s10, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshr_b32 s11, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s5
+; GCN-NEXT:    s_and_b32 s5, s11, s17
+; GCN-NEXT:    s_lshr_b32 s12, s2, 16
+; GCN-NEXT:    s_lshr_b32 s13, s2, 24
+; GCN-NEXT:    s_and_b32 s2, s2, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_or_b32 s2, s2, s5
+; GCN-NEXT:    s_and_b32 s5, s12, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 16
+; GCN-NEXT:    s_or_b32 s2, s2, s5
+; GCN-NEXT:    s_and_b32 s5, s13, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshr_b32 s14, s3, 8
+; GCN-NEXT:    s_or_b32 s2, s2, s5
+; GCN-NEXT:    s_and_b32 s5, s14, s17
+; GCN-NEXT:    s_lshr_b32 s15, s3, 16
+; GCN-NEXT:    s_lshr_b32 s16, s3, 24
+; GCN-NEXT:    s_and_b32 s3, s3, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_and_b32 s5, s15, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 16
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_and_b32 s5, s16, s17
+; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_lshr_b32 s5, s4, 2
+; GCN-NEXT:    s_cmp_eq_u32 s5, 1
+; GCN-NEXT:    s_cselect_b32 s0, s1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s5, 2
+; GCN-NEXT:    s_cselect_b32 s0, s2, s0
+; GCN-NEXT:    s_cmp_eq_u32 s5, 3
+; GCN-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-NEXT:    s_and_b32 s1, s4, 3
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_lshr_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v6
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v7
+; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s1, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v11
+; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v4, v5
+; GFX9-NEXT:    v_or3_b32 v2, v2, v14, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX8-NEXT:    v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_and_b32 s2, s2, 3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX7-NEXT:    v_and_b32_e32 v11, v11, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v10
+; GFX7-NEXT:    v_and_b32_e32 v12, v12, v4
+; GFX7-NEXT:    v_and_b32_e32 v14, v14, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_and_b32_e32 v13, v13, v4
+; GFX7-NEXT:    v_and_b32_e32 v15, v15, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v11
+; GFX7-NEXT:    v_and_b32_e32 v4, v16, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v13
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 %idx) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s5, v8
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s5, v9
+; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v6, v0, v1
+; GFX9-NEXT:    v_or3_b32 v1, v3, v12, v13
+; GFX9-NEXT:    v_or3_b32 v3, v4, v14, v15
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s5, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_or3_b32 v4, v5, v16, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_or3_b32 v0, v0, v18, v19
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT:    v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v18
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 2, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 24, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v13, v13, v0
+; GFX7-NEXT:    v_and_b32_e32 v15, v15, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_and_b32_e32 v14, v14, v0
+; GFX7-NEXT:    v_and_b32_e32 v16, v16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, v6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_and_b32_e32 v0, v17, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v15
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v16
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v14
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v18
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v18
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 %idx) {
+; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT:    s_movk_i32 s16, 0xff
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s4, s0, 8
+; GCN-NEXT:    s_and_b32 s4, s4, s16
+; GCN-NEXT:    s_lshr_b32 s5, s0, 16
+; GCN-NEXT:    s_lshr_b32 s6, s0, 24
+; GCN-NEXT:    s_and_b32 s0, s0, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s4
+; GCN-NEXT:    s_and_b32 s4, s5, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s4
+; GCN-NEXT:    s_and_b32 s4, s6, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshr_b32 s7, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s4
+; GCN-NEXT:    s_and_b32 s4, s7, s16
+; GCN-NEXT:    s_lshr_b32 s8, s1, 16
+; GCN-NEXT:    s_lshr_b32 s9, s1, 24
+; GCN-NEXT:    s_and_b32 s1, s1, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s4
+; GCN-NEXT:    s_and_b32 s4, s8, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 16
+; GCN-NEXT:    s_or_b32 s1, s1, s4
+; GCN-NEXT:    s_and_b32 s4, s9, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshr_b32 s10, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s4
+; GCN-NEXT:    s_and_b32 s4, s10, s16
+; GCN-NEXT:    s_lshr_b32 s11, s2, 16
+; GCN-NEXT:    s_lshr_b32 s12, s2, 24
+; GCN-NEXT:    s_and_b32 s2, s2, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_or_b32 s2, s2, s4
+; GCN-NEXT:    s_and_b32 s4, s11, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 16
+; GCN-NEXT:    s_or_b32 s2, s2, s4
+; GCN-NEXT:    s_and_b32 s4, s12, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshr_b32 s13, s3, 8
+; GCN-NEXT:    s_or_b32 s2, s2, s4
+; GCN-NEXT:    s_and_b32 s4, s13, s16
+; GCN-NEXT:    s_lshr_b32 s14, s3, 16
+; GCN-NEXT:    s_lshr_b32 s15, s3, 24
+; GCN-NEXT:    s_and_b32 s3, s3, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_or_b32 s3, s3, s4
+; GCN-NEXT:    s_and_b32 s4, s14, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 16
+; GCN-NEXT:    s_or_b32 s3, s3, s4
+; GCN-NEXT:    s_and_b32 s4, s15, s16
+; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_or_b32 s3, s3, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 %idx
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 0
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 1
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 2
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 3
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 4
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 5
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx6:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 6
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 7
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 8
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx9:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 9
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx10:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx10:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 10
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx11:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx11:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 11
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx12:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx12:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 12
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx13:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx13:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx13:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 13
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx14:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx14:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx14:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 14
+  ret i8 %element
+}
+
+define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) {
+; GFX9-LABEL: extractelement_vgpr_v16i8_idx15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: extractelement_vgpr_v16i8_idx15:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: extractelement_vgpr_v16i8_idx15:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %element = extractelement <16 x i8> %vector, i32 15
+  ret i8 %element
+}
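
Every check block above recovers byte i of the <16 x i8> load from 32-bit
word i/4 at bit offset 8*(i%4): the dynamic-index tests select the word with
compares and cndmask on idx >> 2 and build the shift amount as (idx & 3) * 8,
while the fixed-index tests fold both steps to constants and finish with a
plain v_lshrrev_b32 by 8, 16, or 24. A minimal C++ sketch of that scalar
recurrence follows; the function name and word-array layout are illustrative
only, not part of this patch.

  #include <cstdint>

  // Extract byte `Idx` from a 16-byte vector stored as four 32-bit words,
  // mirroring the dword-select + shift sequence in the checks above.
  static uint8_t extractByte(const uint32_t Words[4], unsigned Idx) {
    uint32_t Word = Words[Idx >> 2];  // dword select (the cndmask chain)
    unsigned Shift = (Idx & 3) * 8;   // v_lshlrev_b32 of (idx & 3) by 3
    return uint8_t(Word >> Shift);    // v_lshrrev_b32, then truncate
  }

When the index is a compile-time constant, Word and Shift constant-fold,
which is why the idx0..idx15 tests above differ only in which loaded dword
they reassemble and in the final shift amount.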

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
index 1d3311673cf6..b548ff550343 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
 
 ---
 name: extract_vector_elt_0_v2i32
@@ -228,8 +228,10 @@ body: |
 
     ; CHECK-LABEL: name: extract_vector_elt_0_v2i16_i32
     ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>)
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
     ; CHECK: $vgpr0 = COPY [[COPY]](s32)
     %0:_(<2 x s16>) = G_IMPLICIT_DEF
     %1:_(s32) = G_CONSTANT i32 0
@@ -290,25 +292,155 @@ name: extract_vector_elt_v2s8_varidx_i32
 
 body: |
   bb.0:
-    liveins: $vgpr0_vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1
 
     ; CHECK-LABEL: name: extract_vector_elt_v2s8_varidx_i32
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[COPY]](<2 x s32>)
-    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
-    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8
-    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8
     ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32)
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[COPY1]](s32)
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
-    ; CHECK: $vgpr0 = COPY [[COPY3]](s32)
-    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    %1:_(s32) = COPY $vgpr2
-    %2:_(<2 x s8>) = G_TRUNC %0
-    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
-    %4:_(s32) = G_ANYEXT %3
-    $vgpr0 = COPY %4
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY6]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(<2 x s8>) = G_BITCAST %2
+    %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1
+    %5:_(s32) = G_ANYEXT %4
+    $vgpr0 = COPY %5
+...
+
+---
+name: extract_vector_elt_v2s8_constidx_0_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_0_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 0
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY6]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(<2 x s8>) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 0
+    %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4
+    %6:_(s32) = G_ANYEXT %5
+    $vgpr0 = COPY %6
+...
+
+---
+name: extract_vector_elt_v2s8_constidx_1_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_1_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 32
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY6]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(<2 x s8>) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 1
+    %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4
+    %6:_(s32) = G_ANYEXT %5
+    $vgpr0 = COPY %6
+...
+
+---
+name: extract_vector_elt_v4s4_varidx_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v4s4_varidx_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 4
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 4
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
+    ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY8]], 4
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32)
+    ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY9]], 4
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32)
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY10]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(<4 x s4>) = G_BITCAST %2
+    %4:_(s4) = G_EXTRACT_VECTOR_ELT %3, %1
+    %5:_(s32) = G_ANYEXT %4
+    $vgpr0 = COPY %5
 ...
 
 ---
@@ -343,24 +475,559 @@ name: extract_vector_elt_v4s8_varidx_i32
 
 body: |
   bb.0:
-    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
+    liveins: $vgpr0, $vgpr1
 
     ; CHECK-LABEL: name: extract_vector_elt_v4s8_varidx_i32
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4
-    ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[COPY]](<4 x s32>)
-    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>)
-    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8
-    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8
-    ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV2]], 8
-    ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 8
-    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32)
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32)
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
-    ; CHECK: $vgpr0 = COPY [[COPY3]](s32)
-    %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    %1:_(s32) = COPY $vgpr4
-    %2:_(<4 x s8>) = G_TRUNC %0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[SHL3]](s32)
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY6]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<4 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v4s8_constidx_0_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_0_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C4]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C3]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY5]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(<4 x s8>) = G_BITCAST %0
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v4s8_constidx_1_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_1_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY5]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(<4 x s8>) = G_BITCAST %0
+    %2:_(s32) = G_CONSTANT i32 1
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v4s8_constidx_2_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_2_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C1]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY5]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(<4 x s8>) = G_BITCAST %0
+    %2:_(s32) = G_CONSTANT i32 2
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v4s8_constidx_3_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_3_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C2]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY5]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(<4 x s8>) = G_BITCAST %0
+    %2:_(s32) = G_CONSTANT i32 3
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+
+---
+name: extract_vector_elt_v8s8_varidx_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_varidx_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16)
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C5]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C6]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32)
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL6]](s32)
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY6]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_0_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_0_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C2]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_1_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_1_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_3_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_3_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 3
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_4_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_4_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C5]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 4
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_5_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_5_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 5
+    %2:_(<8 x s8>) = G_BITCAST %0
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v8s8_constidx_7_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_7_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 7
+    %2:_(<8 x s8>) = G_BITCAST %0
     %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
     %4:_(s32) = G_ANYEXT %3
     $vgpr0 = COPY %4
@@ -376,9 +1043,14 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s16>), [[COPY1]](s32)
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
-    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[SHL]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -395,8 +1067,10 @@ body: |
 
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx0_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
     ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = G_CONSTANT i32 0
@@ -435,8 +1109,11 @@ body: |
 
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx2_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
-    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK: $vgpr0 = COPY [[DEF]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = G_CONSTANT i32 2
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -579,9 +1256,16 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v4s16_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[COPY1]](s32)
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
-    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<2 x s32>), [[LSHR]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(s32) = COPY $vgpr2
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -599,8 +1283,18 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s128_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr8
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s128) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s128>), [[COPY1]](s32)
-    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[EVEC]](s128)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<2 x s128>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C1]]
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C2]]
+    ; CHECK: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD1]](s32)
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[EVEC]](s64), [[EVEC1]](s64)
+    ; CHECK: [[BITCAST1:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST1]](s128)
     %0:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     %1:_(s32) = COPY $vgpr8
     %2:_(s128) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -1146,3 +1840,335 @@ body: |
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
     S_ENDPGM 0, implicit %3
 ...
+
+---
+name: extract_vector_elt_v32s1_varidx_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v32s1_varidx_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C7]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C8]](s32)
+    ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C9]](s32)
+    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C10]](s32)
+    ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C11]](s32)
+    ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C12]](s32)
+    ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; CHECK: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C13]](s32)
+    ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C14]](s32)
+    ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C15]](s32)
+    ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C16]](s32)
+    ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
+    ; CHECK: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C17]](s32)
+    ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 19
+    ; CHECK: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C18]](s32)
+    ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+    ; CHECK: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C19]](s32)
+    ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 21
+    ; CHECK: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C20]](s32)
+    ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 22
+    ; CHECK: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C21]](s32)
+    ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; CHECK: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C22]](s32)
+    ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C23]](s32)
+    ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK: [[LSHR24:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C24]](s32)
+    ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 26
+    ; CHECK: [[LSHR25:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C25]](s32)
+    ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 27
+    ; CHECK: [[LSHR26:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C26]](s32)
+    ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+    ; CHECK: [[LSHR27:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C27]](s32)
+    ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+    ; CHECK: [[LSHR28:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C28]](s32)
+    ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
+    ; CHECK: [[LSHR29:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C29]](s32)
+    ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[LSHR30:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C30]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C3]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32)
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C7]](s32)
+    ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C8]](s32)
+    ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]]
+    ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32)
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]]
+    ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C9]](s32)
+    ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[OR8]], [[SHL9]]
+    ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C]]
+    ; CHECK: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C10]](s32)
+    ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]]
+    ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32)
+    ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C]]
+    ; CHECK: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C11]](s32)
+    ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]]
+    ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C]]
+    ; CHECK: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C12]](s32)
+    ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR11]], [[SHL12]]
+    ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32)
+    ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C]]
+    ; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C13]](s32)
+    ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[OR12]], [[SHL13]]
+    ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32)
+    ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C]]
+    ; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C14]](s32)
+    ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[OR13]], [[SHL14]]
+    ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR15]](s32)
+    ; CHECK: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C]]
+    ; CHECK: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND16]], [[C15]](s32)
+    ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[OR14]], [[SHL15]]
+    ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32)
+    ; CHECK: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C]]
+    ; CHECK: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C16]](s32)
+    ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[OR15]], [[SHL16]]
+    ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LSHR17]](s32)
+    ; CHECK: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C]]
+    ; CHECK: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C17]](s32)
+    ; CHECK: [[OR17:%[0-9]+]]:_(s32) = G_OR [[OR16]], [[SHL17]]
+    ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR18]](s32)
+    ; CHECK: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C]]
+    ; CHECK: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[C18]](s32)
+    ; CHECK: [[OR18:%[0-9]+]]:_(s32) = G_OR [[OR17]], [[SHL18]]
+    ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR19]](s32)
+    ; CHECK: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C]]
+    ; CHECK: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C19]](s32)
+    ; CHECK: [[OR19:%[0-9]+]]:_(s32) = G_OR [[OR18]], [[SHL19]]
+    ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32)
+    ; CHECK: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C]]
+    ; CHECK: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[C20]](s32)
+    ; CHECK: [[OR20:%[0-9]+]]:_(s32) = G_OR [[OR19]], [[SHL20]]
+    ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR21]](s32)
+    ; CHECK: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C]]
+    ; CHECK: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C21]](s32)
+    ; CHECK: [[OR21:%[0-9]+]]:_(s32) = G_OR [[OR20]], [[SHL21]]
+    ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32)
+    ; CHECK: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C]]
+    ; CHECK: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[C22]](s32)
+    ; CHECK: [[OR22:%[0-9]+]]:_(s32) = G_OR [[OR21]], [[SHL22]]
+    ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR23]](s32)
+    ; CHECK: [[AND24:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C]]
+    ; CHECK: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[AND24]], [[C23]](s32)
+    ; CHECK: [[OR23:%[0-9]+]]:_(s32) = G_OR [[OR22]], [[SHL23]]
+    ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR24]](s32)
+    ; CHECK: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C]]
+    ; CHECK: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[C24]](s32)
+    ; CHECK: [[OR24:%[0-9]+]]:_(s32) = G_OR [[OR23]], [[SHL24]]
+    ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR25]](s32)
+    ; CHECK: [[AND26:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C]]
+    ; CHECK: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[AND26]], [[C25]](s32)
+    ; CHECK: [[OR25:%[0-9]+]]:_(s32) = G_OR [[OR24]], [[SHL25]]
+    ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR26]](s32)
+    ; CHECK: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C]]
+    ; CHECK: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[C26]](s32)
+    ; CHECK: [[OR26:%[0-9]+]]:_(s32) = G_OR [[OR25]], [[SHL26]]
+    ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR27]](s32)
+    ; CHECK: [[AND28:%[0-9]+]]:_(s32) = G_AND [[COPY30]], [[C]]
+    ; CHECK: [[SHL27:%[0-9]+]]:_(s32) = G_SHL [[AND28]], [[C27]](s32)
+    ; CHECK: [[OR27:%[0-9]+]]:_(s32) = G_OR [[OR26]], [[SHL27]]
+    ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LSHR28]](s32)
+    ; CHECK: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C]]
+    ; CHECK: [[SHL28:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[C28]](s32)
+    ; CHECK: [[OR28:%[0-9]+]]:_(s32) = G_OR [[OR27]], [[SHL28]]
+    ; CHECK: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR29]](s32)
+    ; CHECK: [[AND30:%[0-9]+]]:_(s32) = G_AND [[COPY32]], [[C]]
+    ; CHECK: [[SHL29:%[0-9]+]]:_(s32) = G_SHL [[AND30]], [[C29]](s32)
+    ; CHECK: [[OR29:%[0-9]+]]:_(s32) = G_OR [[OR28]], [[SHL29]]
+    ; CHECK: [[COPY33:%[0-9]+]]:_(s32) = COPY [[LSHR30]](s32)
+    ; CHECK: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY33]], [[C]]
+    ; CHECK: [[SHL30:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[C30]](s32)
+    ; CHECK: [[OR30:%[0-9]+]]:_(s32) = G_OR [[OR29]], [[SHL30]]
+    ; CHECK: [[AND32:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C30]]
+    ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[SHL31:%[0-9]+]]:_(s32) = G_SHL [[AND32]], [[C31]](s32)
+    ; CHECK: [[LSHR31:%[0-9]+]]:_(s32) = G_LSHR [[OR30]], [[SHL31]](s32)
+    ; CHECK: [[COPY34:%[0-9]+]]:_(s32) = COPY [[LSHR31]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY34]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<32 x s1>) = G_BITCAST %0
+    %3:_(s1) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v12s8_varidx_s32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2, $vgpr3
+    ; CHECK-LABEL: name: extract_vector_elt_v12s8_varidx_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+    ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]]
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C1]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32)
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]]
+    ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32)
+    ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]]
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32)
+    ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C4]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[LSHR9]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]]
+    ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C5]](s32)
+    ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL9]](s32)
+    ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY14]](s32)
+    %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    %1:_(<12 x s8>) = G_BITCAST %0
+    %2:_(s32) = COPY $vgpr3
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v3s8_varidx_s32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: extract_vector_elt_v3s8_varidx_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 8
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
+    ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 8
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[COPY1]](s32)
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY8]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s24) = G_TRUNC %0
+    %3:_(<3 x s8>) = G_BITCAST %2
+    %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1
+    %5:_(s32) = G_ANYEXT %4
+    $vgpr0 = COPY %5
+...
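
(For reference: the shift/mask arithmetic these auto-generated checks encode reduces to the sketch below. It is illustrative only; the helper name and shape are hypothetical, not the in-tree LegalizerHelper code.)

#include <cassert>
#include <cstdint>

// Sketch of the index arithmetic the new bitcast action emits for a
// dynamic extract from a vector of sub-32-bit elements, as checked in
// the v32s1 and v12s8 tests above. Assumes EltBits is a power of two
// strictly less than 32.
uint32_t extractNarrowElt(const uint32_t *Words, uint32_t Idx,
                          uint32_t EltBits) {
  assert(EltBits < 32 && (EltBits & (EltBits - 1)) == 0);
  uint32_t PerWord = 32 / EltBits;             // elements per 32-bit word
  uint32_t Word = Words[Idx / PerWord];        // the 32-bit G_EXTRACT_VECTOR_ELT
  uint32_t BitOff = (Idx % PerWord) * EltBits; // (Idx & (PerWord-1)) << Log2(EltBits)
  return (Word >> BitOff) & ((1u << EltBits) - 1u); // G_LSHR; mask models the trunc
}

// For v12s8 (EltBits = 8): Idx / 4 is the G_LSHR-by-2 of the index and
// (Idx % 4) * 8 is the (Idx & 3) << 3 bit offset seen in the checks.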

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index 86d7a2f4e4db..e566572763e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -16,8 +16,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -38,8 +40,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -67,12 +71,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -94,12 +100,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -130,15 +138,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -163,15 +174,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -205,15 +219,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -238,15 +255,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -279,12 +299,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -306,12 +328,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -342,15 +366,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -375,15 +402,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -417,15 +447,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -450,15 +483,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -492,18 +528,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -527,18 +566,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -571,12 +613,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -598,12 +642,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -634,15 +680,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -667,15 +716,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -709,18 +761,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -744,18 +799,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -789,18 +847,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -824,18 +885,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -869,15 +933,18 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -902,15 +969,18 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -944,18 +1014,21 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -979,18 +1052,21 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -1028,8 +1104,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d
@@ -1050,8 +1128,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -1079,12 +1159,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
@@ -1106,12 +1188,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
@@ -1142,15 +1226,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1175,15 +1262,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1217,15 +1307,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1250,15 +1343,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1291,12 +1387,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
@@ -1318,12 +1416,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
@@ -1354,15 +1454,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1387,15 +1490,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1429,15 +1535,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1462,15 +1571,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1504,18 +1616,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1539,18 +1654,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1583,12 +1701,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
@@ -1610,12 +1730,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
@@ -1646,15 +1768,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1679,15 +1804,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1721,18 +1849,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1756,18 +1887,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1801,18 +1935,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1836,18 +1973,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1881,15 +2021,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1914,15 +2057,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1956,18 +2102,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1991,18 +2140,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
-  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -2030,8 +2182,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2052,8 +2206,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2081,8 +2237,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2103,8 +2261,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2132,8 +2292,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2154,8 +2316,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2183,8 +2347,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2205,8 +2371,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2234,8 +2402,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2256,8 +2426,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2285,8 +2457,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2307,8 +2481,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2336,8 +2512,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2358,8 +2536,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2387,8 +2567,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2409,8 +2591,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2438,8 +2622,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -2456,8 +2642,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -2481,8 +2669,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2501,8 +2691,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2529,8 +2721,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d_V1
@@ -2547,8 +2741,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -2574,8 +2770,10 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
   ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d_V2
@@ -2594,8 +2792,10 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -2618,8 +2818,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2640,8 +2842,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2669,8 +2873,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2691,8 +2897,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2720,8 +2928,10 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -2742,8 +2952,10 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -2776,8 +2988,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d_glc
@@ -2798,8 +3012,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -2827,8 +3043,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d_slc
@@ -2849,8 +3067,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -2878,8 +3098,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX9:   S_ENDPGM 0
   ; GFX10NSA-LABEL: name: store_1d_glc_slc
@@ -2900,8 +3122,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
   ; GFX10NSA:   S_ENDPGM 0
 main_body:
@@ -2970,8 +3194,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX9:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
@@ -2994,8 +3220,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
@@ -3028,12 +3256,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
@@ -3057,12 +3287,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
@@ -3098,15 +3330,18 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX9:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -3133,15 +3368,18 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
   ; GFX10NSA:   [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -3180,18 +3418,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX9:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX9:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX9:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX9:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX9:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
@@ -3217,18 +3458,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
-  ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; GFX10NSA:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
   ; GFX10NSA:   [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+  ; GFX10NSA:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
   ; GFX10NSA:   [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
-  ; GFX10NSA:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
-  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+  ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
   ; GFX10NSA:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
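
As a rough sketch of the arithmetic the updated checks encode: each 16-bit half is now recovered from the bitcast 32-bit word by a G_LSHR with a shift of 0 (low half) or 16 (high half), rather than using the word value directly for the low half. A minimal C model of that extraction, assuming little-endian packing of 16-bit elements into 32-bit words; the helper name is illustrative, not from the patch:

    #include <stdint.h>

    /* Element idx of a uint16_t vector that is only addressable in 32-bit
       units: pick the containing word, then shift the desired half down. */
    static inline uint16_t extract_u16(const uint32_t *words, unsigned idx) {
      uint32_t word = words[idx >> 1];  /* idx / 2: containing 32-bit word */
      unsigned shift = (idx & 1) * 16;  /* 0 for the low half, 16 for high */
      return (uint16_t)(word >> shift);
    }

For a constant even index the shift amount folds to 0, which is the no-op G_LSHR by the constant 0 visible in the checks above.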

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
index 9f614bef378e..692078edbe65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
@@ -217,16 +217,18 @@ body: |
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -250,15 +252,17 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
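
After both scalars come out of the new shift-based extraction, the GFX8 checks above rebuild the <2 x s16> result with the usual mask/shift/or sequence (G_AND with 65535, G_SHL by 16, G_OR). A C sketch of that repacking step, with an illustrative helper name:

    #include <stdint.h>

    /* Pack two 16-bit lanes back into one 32-bit word, low lane first,
       mirroring the G_AND / G_SHL / G_OR sequence in the checks. */
    static inline uint32_t pack2_u16(uint16_t lo, uint16_t hi) {
      return ((uint32_t)lo & 0xffffu) | (((uint32_t)hi & 0xffffu) << 16);
    }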

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
index d4de328b679a..d4b80ce4a572 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
@@ -45,13 +45,15 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
     ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
@@ -80,14 +82,16 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
     ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
     ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
     ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_0
@@ -115,16 +119,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -154,15 +160,17 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -191,15 +199,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -267,15 +278,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -304,13 +318,15 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
     ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
@@ -339,14 +355,16 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
     ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
     ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
     ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_2
@@ -374,16 +392,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -413,15 +433,17 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -561,15 +583,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -577,10 +602,13 @@ body: |
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -602,15 +630,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -640,15 +671,17 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -657,11 +690,13 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -683,16 +718,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -700,12 +737,14 @@ body: |
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -728,15 +767,17 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -745,11 +786,13 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
     ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -771,16 +814,18 @@ body: |
     ; GFX8: liveins: $vgpr0, $vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -788,12 +833,14 @@ body: |
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -933,24 +980,38 @@ body: |
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16
-    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0
-    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16)
-    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
+    ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32)
+    ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_0
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16)
-    ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16)
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32)
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
@@ -971,24 +1032,36 @@ body: |
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16
-    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48
-    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16)
-    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16)
+    ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0
     ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32)
+    ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16)
-    ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16)
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32)
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3