[llvm] e2f1b48 - GlobalISel: Implement bitcast action for G_INSERT_VECTOR_ELT

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 11 07:39:21 PDT 2020


Author: Matt Arsenault
Date: 2020-08-11T10:39:14-04:00
New Revision: e2f1b48f867d9ed349b7520dea495717bca26f31

URL: https://github.com/llvm/llvm-project/commit/e2f1b48f867d9ed349b7520dea495717bca26f31
DIFF: https://github.com/llvm/llvm-project/commit/e2f1b48f867d9ed349b7520dea495717bca26f31.diff

LOG: GlobalISel: Implement bitcast action for G_INSERT_VECTOR_ELT

This mirrors the support for the equivalent extracts. This also
creates a huge mess that would be greatly improved if we had any bit
operation combines.

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 2057be8ed454..471aedca3786 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -318,6 +318,10 @@ class LegalizerHelper {
   LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                          LLT CastTy);
 
+  /// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT.
+  LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                        LLT CastTy);
+
   LegalizeResult lowerBitcast(MachineInstr &MI);
   LegalizeResult lowerLoad(MachineInstr &MI);
   LegalizeResult lowerStore(MachineInstr &MI);

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 82024583e8a4..0b07dd044b80 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2369,6 +2369,28 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
+/// Figure out the bit offset into a register when coercing a vector index for
+/// the wide element type. This is only for the case when promoting vector to
+/// one with larger elements.
+///
+/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
+/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
+static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
+                                                   Register Idx,
+                                                   unsigned NewEltSize,
+                                                   unsigned OldEltSize) {
+  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+  LLT IdxTy = B.getMRI()->getType(Idx);
+
+  // Now figure out the amount we need to shift to get the target bits.
+  auto OffsetMask = B.buildConstant(
+    IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
+  auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
+  return B.buildShl(IdxTy, OffsetIdx,
+                    B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
+}
+
 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
 /// is casting to a vector with a smaller element size, perform multiple element
 /// extracts and merge the results. If this is coercing to a vector with larger
@@ -2467,13 +2489,9 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                                      ScaledIdx).getReg(0);
     }
 
-    // Now figure out the amount we need to shift to get the target bits.
-    auto OffsetMask = MIRBuilder.buildConstant(
-      IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
-    auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask);
-    auto OffsetBits = MIRBuilder.buildShl(
-      IdxTy, OffsetIdx,
-      MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize)));
+    // Compute the bit offset into the register of the target element.
+    Register OffsetBits = getBitcastWiderVectorElementOffset(
+      MIRBuilder, Idx, NewEltSize, OldEltSize);
 
     // Shift the wide element to get the target element.
     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
@@ -2485,6 +2503,104 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
   return UnableToLegalize;
 }
 
+/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits in \p
+/// TargetReg, while preserving other bits in \p TargetReg.
+///
+/// (InsertReg << Offset) | (TargetReg & ~(~(-1 << InsertReg.size()) << Offset))
+static Register buildBitFieldInsert(MachineIRBuilder &B,
+                                    Register TargetReg, Register InsertReg,
+                                    Register OffsetBits) {
+  LLT TargetTy = B.getMRI()->getType(TargetReg);
+  LLT InsertTy = B.getMRI()->getType(InsertReg);
+  auto ZextVal = B.buildZExt(TargetTy, InsertReg);
+  auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
+
+  // Produce a bitmask of the value to insert
+  auto EltMask = B.buildConstant(
+    TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
+                                   InsertTy.getSizeInBits()));
+  // Shift it into position
+  auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
+  auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
+
+  // Clear out the bits in the wide element
+  auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
+
+  // The value to insert has all zeros already, so stick it into the masked
+  // wide element.
+  return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
+}
+
+/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
+/// is increasing the element size, perform the indexing in the target element
+/// type, and use bit operations to insert at the element position. This is
+/// intended for architectures that can dynamically index the register file and
+/// want to force indexing in the native register size.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                        LLT CastTy) {
+  if (TypeIdx != 0)
+    return UnableToLegalize;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register Val = MI.getOperand(2).getReg();
+  Register Idx = MI.getOperand(3).getReg();
+
+  LLT VecTy = MRI.getType(Dst);
+  LLT ValTy = MRI.getType(Val);
+  LLT IdxTy = MRI.getType(Idx);
+
+  LLT VecEltTy = VecTy.getElementType();
+  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
+  const unsigned NewEltSize = NewEltTy.getSizeInBits();
+  const unsigned OldEltSize = VecEltTy.getSizeInBits();
+
+  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
+  unsigned OldNumElts = VecTy.getNumElements();
+
+  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
+  if (NewNumElts < OldNumElts) {
+    if (NewEltSize % OldEltSize != 0)
+      return UnableToLegalize;
+
+    // This only depends on powers of 2 because we use bit tricks to figure out
+    // the bit offset we need to shift to get the target element. A general
+    // expansion could emit division/multiply.
+    if (!isPowerOf2_32(NewEltSize / OldEltSize))
+      return UnableToLegalize;
+
+    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
+
+    // Divide to get the index in the wider element type.
+    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
+
+    Register ExtractedElt = CastVec;
+    if (CastTy.isVector()) {
+      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
+                                                          ScaledIdx).getReg(0);
+    }
+
+    // Compute the bit offset into the register of the target element.
+    Register OffsetBits = getBitcastWiderVectorElementOffset(
+      MIRBuilder, Idx, NewEltSize, OldEltSize);
+
+    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
+                                               Val, OffsetBits);
+    if (CastTy.isVector()) {
+      InsertedElt = MIRBuilder.buildInsertVectorElement(
+        CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
+    }
+
+    MIRBuilder.buildBitcast(Dst, InsertedElt);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerLoad(MachineInstr &MI) {
   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
@@ -2674,6 +2790,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   }
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
+  case TargetOpcode::G_INSERT_VECTOR_ELT:
+    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
   default:
     return UnableToLegalize;
   }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 79b559a08de6..40be9b75d79f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1338,11 +1338,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                   VecTy.getSizeInBits() <= MaxRegisterSize &&
                   IdxTy.getSizeInBits() == 32;
         })
-      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
-                 bitcastToVectorElement32(1))
+      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+                 bitcastToVectorElement32(VecTypeIdx))
       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
       .bitcastIf(
-        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
+        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
         [=](const LegalityQuery &Query) {
           // For > 64-bit element types, try to turn this into a 64-bit
           // element vector since we may be able to do better indexing

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
new file mode 100644
index 000000000000..008b09d96887
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -0,0 +1,3306 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v2i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT:    s_and_b32 s1, s5, 1
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX9-NEXT:    s_or_b32 s0, s0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_and_b32 s1, s5, 1
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_and_b32 s3, s4, s2
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX8-NEXT:    s_or_b32 s0, s0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_and_b32 s1, s5, 1
+; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_and_b32 s3, s4, s2
+; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX7-NEXT:    s_or_b32 s0, s0, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v2i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_and_b32 s1, s3, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s2, s0
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s3, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_and_b32 s1, s3, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_and_b32 s2, s2, s0
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(1 )* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v2i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT:    s_and_b32 s1, s4, 1
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_and_b32 s1, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_and_b32 s1, s4, 1
+; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX7-NEXT:    v_or_b32_e32 v2, s0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v2i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-NEXT:    s_and_b32 s2, s4, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_mov_b32 s1, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_and_b32 s2, s4, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s1, 0xffff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT:    s_and_b32 s2, s4, s1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v2i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    s_mov_b32 s1, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    s_mov_b32 s1, 0xffff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v2i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v0, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v2i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v2i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v0, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
+  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
+  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
+  ret void
+}
+
+; FIXME: 3 element load/store legalization
+; define amdgpu_ps void @insertelement_s_v3i16_s_s(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i16_s_s(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(1 )* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i16_v_s(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i16_s_v(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i16_v_v(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i16_s_v(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i16_v_s(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i16_v_v(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
+;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
+;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
+;   ret void
+; }
+
+; Insert an inreg (SGPR) i16 %val at an inreg i32 %idx into a <4 x i16>
+; loaded through a VGPR pointer. Checks show the index split into a dword
+; index (idx >> 1) plus a 16-bit sub-dword shift ((idx & 1) << 4), with
+; the element merged by mask-and-or into the selected dword.
+define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v4i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_lshr_b32 s1, s3, 1
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s2, s0
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_lshr_b32 s1, s3, 1
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_lshr_b32 s1, s3, 1
+; GFX7-NEXT:    s_and_b32 s3, s3, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX7-NEXT:    s_and_b32 s2, s2, s0
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Insert a VGPR i16 %val at an inreg index into a <4 x i16> loaded from a
+; constant (addrspace 4) pointer via s_load_dwordx2; dword selection uses
+; scalar cselect, sub-dword merge uses andn2 + shifted-or of the VGPR value.
+define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v4i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX9-NEXT:    s_lshl_b32 s5, s5, s4
+; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
+; GFX8-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    s_mov_b32 s5, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX7-NEXT:    s_and_b32 s4, s4, 1
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
+; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Insert an inreg i16 %val at a VGPR (divergent) index into a <4 x i16>
+; loaded from a constant pointer; both the dword select and the sub-dword
+; mask/shift are computed in VALU since the index is divergent.
+define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v4i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_and_b32 s3, s4, s2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT:    s_and_b32 s3, s4, s2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s3, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Insert a VGPR i16 %val at a VGPR index into a <4 x i16> from a constant
+; pointer; value shift and mask are fully VALU (sdwa shift on GFX8/9).
+define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v4i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Insert an inreg i16 %val at a VGPR index into a <4 x i16> loaded through
+; a VGPR global pointer; mask/shift built in VALU from the divergent index.
+define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v4i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v4, v2, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v5, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v4, s1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Insert a VGPR i16 %val at an inreg index into a <4 x i16> loaded through
+; a VGPR global pointer; uniform index lets the mask/shift stay scalar.
+define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v4i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_lshr_b32 s1, s2, 1
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_lshr_b32 s1, s2, 1
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 1
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Fully divergent case: VGPR value and VGPR index into a <4 x i16> loaded
+; through a VGPR global pointer; everything is computed in VALU.
+define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v4i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v5, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v5, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
+  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
+  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
+  ret void
+}
+
+; Fully uniform <8 x i16> insert: everything stays in SALU (cselect chains
+; over the four dwords, andn2/or sub-dword merge) before the final store.
+define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v8i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s6, s5, 1
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX9-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX9-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 4
+; GFX9-NEXT:    s_and_b32 s4, s4, s8
+; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX9-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 1
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX8-NEXT:    s_mov_b32 s8, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX8-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 4
+; GFX8-NEXT:    s_and_b32 s4, s4, s8
+; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_lshr_b32 s6, s5, 1
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX7-NEXT:    s_mov_b32 s8, 0xffff
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX7-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX7-NEXT:    s_and_b32 s5, s5, 1
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 4
+; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX7-NEXT:    s_lshl_b32 s5, s8, s5
+; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX7-NEXT:    s_or_b32 s4, s5, s4
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX7-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX7-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX7-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX7-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v8i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_and_b32 s1, s3, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_and_b32 s2, s2, s0
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_not_b32 s5, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s5, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s3, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s5, s2, s1
+; GFX8-NEXT:    s_not_b32 s6, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, s5, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_and_b32 s1, s3, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshr_b32 s4, s3, 1
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_and_b32 s2, s2, s0
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    s_lshl_b32 s5, s2, s1
+; GFX7-NEXT:    s_not_b32 s6, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v8i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX9-NEXT:    s_mov_b32 s7, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX9-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX9-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s4
+; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_lshr_b32 s5, s4, 1
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX8-NEXT:    s_mov_b32 s7, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX8-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX8-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    s_lshl_b32 s4, s7, s4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX8-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 1
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX7-NEXT:    s_mov_b32 s7, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX7-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX7-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX7-NEXT:    s_and_b32 s4, s4, 1
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_lshl_b32 s4, s7, s4
+; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v8i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-NEXT:    s_and_b32 s4, s4, s5
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v5, v1, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_and_b32 s4, s4, s5
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v3, s10
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT:    s_and_b32 s4, s4, s5
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_mov_b32_e32 v5, s11
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    v_mov_b32_e32 v3, s11
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v8i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v5, v2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    s_mov_b32 s8, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    s_mov_b32 s8, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    v_mov_b32_e32 v5, s6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
+; GFX7-NEXT:    v_mov_b32_e32 v6, s7
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v8i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v7, v7, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX8-NEXT:    v_or_b32_e32 v7, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT:    v_or_b32_e32 v7, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v8i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_not_b32 s5, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v7, v1, s5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_not_b32 s5, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT:    v_or_b32_e32 v7, v1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_not_b32 s5, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT:    v_or_b32_e32 v7, v1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v8i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
+  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v16i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s7, s5, 1
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX9-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX9-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX9-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX9-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX9-NEXT:    s_and_b32 s1, s5, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_and_b32 s3, s4, s2
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX9-NEXT:    s_or_b32 s16, s0, s3
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s16, s8
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX9-NEXT:    s_cselect_b32 s1, s16, s9
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX9-NEXT:    s_cselect_b32 s2, s16, s10
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX9-NEXT:    s_cselect_b32 s3, s16, s11
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX9-NEXT:    s_cselect_b32 s4, s16, s12
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX9-NEXT:    s_cselect_b32 s5, s16, s13
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX8-NEXT:    s_lshr_b32 s7, s5, 1
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX8-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX8-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX8-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX8-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX8-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX8-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX8-NEXT:    s_and_b32 s1, s5, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_and_b32 s3, s4, s2
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX8-NEXT:    s_or_b32 s16, s0, s3
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s16, s8
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX8-NEXT:    s_cselect_b32 s1, s16, s9
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX8-NEXT:    s_cselect_b32 s2, s16, s10
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX8-NEXT:    s_cselect_b32 s3, s16, s11
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX8-NEXT:    s_cselect_b32 s4, s16, s12
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX8-NEXT:    s_cselect_b32 s5, s16, s13
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_cselect_b32 s6, s16, s14
+; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX8-NEXT:    s_cselect_b32 s7, s16, s15
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mov_b32_e32 v7, s7
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 1
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX7-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX7-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX7-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX7-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX7-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX7-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX7-NEXT:    s_and_b32 s1, s5, 1
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_and_b32 s3, s4, s2
+; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX7-NEXT:    s_or_b32 s16, s0, s3
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX7-NEXT:    s_cselect_b32 s0, s16, s8
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
+; GFX7-NEXT:    s_cselect_b32 s1, s16, s9
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 2
+; GFX7-NEXT:    s_cselect_b32 s2, s16, s10
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX7-NEXT:    s_cselect_b32 s3, s16, s11
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 4
+; GFX7-NEXT:    s_cselect_b32 s4, s16, s12
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 5
+; GFX7-NEXT:    s_cselect_b32 s5, s16, s13
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    s_cselect_b32 s6, s16, s14
+; GFX7-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX7-NEXT:    s_cselect_b32 s7, s16, s15
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    v_mov_b32_e32 v5, s5
+; GFX7-NEXT:    v_mov_b32_e32 v6, s6
+; GFX7-NEXT:    v_mov_b32_e32 v7, s7
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v16i16_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX9-NEXT:    s_and_b32 s1, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_and_b32 s2, s2, s0
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_not_b32 s13, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
+; GFX9-NEXT:    v_and_or_b32 v10, v1, s13, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i16_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT:    s_and_b32 s1, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX8-NEXT:    s_lshl_b32 s13, s2, s1
+; GFX8-NEXT:    s_not_b32 s14, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v3, s[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v7, s[10:11]
+; GFX8-NEXT:    v_and_b32_e32 v8, s14, v8
+; GFX8-NEXT:    v_or_b32_e32 v8, s13, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i16_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s18, 0
+; GFX7-NEXT:    s_mov_b32 s19, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[16:17], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_and_b32 s1, s3, 1
+; GFX7-NEXT:    s_lshr_b32 s12, s3, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    s_and_b32 s2, s2, s0
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX7-NEXT:    s_lshl_b32 s13, s2, s1
+; GFX7-NEXT:    s_not_b32 s14, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[10:11]
+; GFX7-NEXT:    v_and_b32_e32 v0, s14, v0
+; GFX7-NEXT:    v_or_b32_e32 v10, s13, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v16i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    s_mov_b32 s3, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 2
+; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 3
+; GFX9-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 4
+; GFX9-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 5
+; GFX9-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 6
+; GFX9-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 7
+; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX9-NEXT:    s_and_b32 s1, s4, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
+; GFX9-NEXT:    s_andn2_b32 s0, s0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_lshl_or_b32 v8, v0, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, s15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 2
+; GFX8-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 3
+; GFX8-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 4
+; GFX8-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 5
+; GFX8-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 6
+; GFX8-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 7
+; GFX8-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX8-NEXT:    s_and_b32 s1, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v8, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, s13
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, s15
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    s_mov_b32 s3, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 2
+; GFX7-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 3
+; GFX7-NEXT:    s_cselect_b32 s0, s11, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 4
+; GFX7-NEXT:    s_cselect_b32 s0, s12, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 5
+; GFX7-NEXT:    s_cselect_b32 s0, s13, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 6
+; GFX7-NEXT:    s_cselect_b32 s0, s14, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 7
+; GFX7-NEXT:    s_cselect_b32 s0, s15, s0
+; GFX7-NEXT:    s_and_b32 s1, s4, 1
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX7-NEXT:    v_or_b32_e32 v8, s0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s11
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v5, s13
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v4, s12
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX7-NEXT:    v_mov_b32_e32 v6, s14
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
+; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v16i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mov_b32_e32 v3, s18
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s19
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s20
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v6, s21
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX9-NEXT:    v_mov_b32_e32 v7, s22
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-NEXT:    s_and_b32 s4, s4, s5
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v9, s23
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v9, v1, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    v_mov_b32_e32 v4, s20
+; GFX9-NEXT:    v_mov_b32_e32 v5, s21
+; GFX9-NEXT:    v_mov_b32_e32 v6, s22
+; GFX9-NEXT:    v_mov_b32_e32 v7, s23
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s20
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v6, s21
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s22
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_and_b32 s4, s4, s5
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v9, s23
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v9, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v5, s21
+; GFX8-NEXT:    v_mov_b32_e32 v6, s22
+; GFX8-NEXT:    v_mov_b32_e32 v7, s23
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s16
+; GFX7-NEXT:    v_mov_b32_e32 v2, s17
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v4, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v5, s20
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0xffff
+; GFX7-NEXT:    v_mov_b32_e32 v6, s21
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX7-NEXT:    v_mov_b32_e32 v7, s22
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT:    s_and_b32 s4, s4, s5
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_mov_b32_e32 v9, s23
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v9, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s16
+; GFX7-NEXT:    v_mov_b32_e32 v1, s17
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    v_mov_b32_e32 v3, s19
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v4, s20
+; GFX7-NEXT:    v_mov_b32_e32 v5, s21
+; GFX7-NEXT:    v_mov_b32_e32 v6, s22
+; GFX7-NEXT:    v_mov_b32_e32 v7, s23
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v16i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s16
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, s17
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, s18
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX9-NEXT:    s_mov_b32 s20, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
+; GFX9-NEXT:    v_mov_b32_e32 v10, s19
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v9, v2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    v_mov_b32_e32 v5, s17
+; GFX9-NEXT:    v_mov_b32_e32 v6, s18
+; GFX9-NEXT:    v_mov_b32_e32 v7, s19
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mov_b32_e32 v4, s14
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v6, s16
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, s17
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, s18
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX8-NEXT:    s_mov_b32 s20, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
+; GFX8-NEXT:    v_mov_b32_e32 v10, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s17
+; GFX8-NEXT:    v_mov_b32_e32 v6, s18
+; GFX8-NEXT:    v_mov_b32_e32 v7, s19
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, s12
+; GFX7-NEXT:    v_mov_b32_e32 v3, s13
+; GFX7-NEXT:    v_mov_b32_e32 v4, s14
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v5, s15
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v6, s16
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    s_mov_b32 s20, 0xffff
+; GFX7-NEXT:    v_mov_b32_e32 v7, s17
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX7-NEXT:    v_mov_b32_e32 v9, s18
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s20, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s20, v1
+; GFX7-NEXT:    v_mov_b32_e32 v10, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s12
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mov_b32_e32 v2, s14
+; GFX7-NEXT:    v_mov_b32_e32 v3, s15
+; GFX7-NEXT:    v_mov_b32_e32 v4, s16
+; GFX7-NEXT:    v_mov_b32_e32 v5, s17
+; GFX7-NEXT:    v_mov_b32_e32 v6, s18
+; GFX7-NEXT:    v_mov_b32_e32 v7, s19
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v16i16_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
+; GFX9-NEXT:    v_and_or_b32 v11, v11, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i16_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
+; GFX8-NEXT:    v_and_b32_e32 v1, v11, v1
+; GFX8-NEXT:    v_or_b32_e32 v11, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i16_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s18, 0
+; GFX7-NEXT:    s_mov_b32 s19, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[16:17], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
+; GFX7-NEXT:    v_and_b32_e32 v1, v11, v1
+; GFX7-NEXT:    v_or_b32_e32 v11, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v16i16_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_not_b32 s13, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
+; GFX9-NEXT:    v_and_or_b32 v11, v1, s13, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i16_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
+; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_not_b32 s13, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
+; GFX8-NEXT:    v_and_b32_e32 v1, s13, v1
+; GFX8-NEXT:    v_or_b32_e32 v11, v1, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i16_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s18, 0
+; GFX7-NEXT:    s_mov_b32 s19, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[16:17], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 1
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_not_b32 s13, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
+; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
+; GFX7-NEXT:    v_and_b32_e32 v1, s13, v1
+; GFX7-NEXT:    v_or_b32_e32 v11, v1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v16i16_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX9-NEXT:    v_and_or_b32 v12, v3, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, 0, 16
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_addc_u32 s1, 0, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v11, s1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i16_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v12, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
+; GFX8-NEXT:    s_add_u32 s0, 0, 16
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    s_addc_u32 s1, 0, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i16_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s18, 0
+; GFX7-NEXT:    s_mov_b32 s19, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[16:17], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v12, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
+  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
new file mode 100644
index 000000000000..7c3e74dfcf69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -0,0 +1,5909 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v2i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v2i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v2i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    global_load_ushort v1, v[1:2], off
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    flat_load_ushort v1, v[1:2]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v2i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    global_load_ushort v1, v[1:2], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    flat_load_ushort v1, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v2i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v2i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v2i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v2i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v3, s2
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v2i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v2i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v2i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v2i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
+  %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
+  store <2 x i8> %insert, <2 x i8> addrspace(1)* null
+  ret void
+}
+
+; FIXME: 3 element load/store legalization
+; define amdgpu_ps void @insertelement_s_v3i8_s_s(<3 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i8_s_s(<3 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i8_v_s(<3 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i8_s_v(<3 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_s_v3i8_v_v(<3 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i8_s_v(<3 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i8_v_s(<3 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+; define amdgpu_ps void @insertelement_v_v3i8_v_v(<3 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) {
+;   %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr
+;   %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
+;   store <3 x i8> %insert, <3 x i8> addrspace(1)* null
+;   ret void
+; }
+
+define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v4i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    s_and_b32 s3, s3, 3
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s1, s3
+; GFX9-NEXT:    s_not_b32 s3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
+; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
+; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    s_and_b32 s1, s3, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s3, 3
+; GFX7-NEXT:    s_and_b32 s2, s2, s0
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, s0, s1
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v4i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX9-NEXT:    s_lshl_b32 s3, s6, s2
+; GFX9-NEXT:    s_andn2_b32 s1, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s6, v1
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s5, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s4, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s5, s1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX7-NEXT:    s_and_b32 s1, s1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s3, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s4, 3
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    s_lshl_b32 s1, s5, s1
+; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v4i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s6
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s4, s6
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s6
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v0, s1, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s6, v1
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s5, 0xff
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s4, s5
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX7-NEXT:    s_and_b32 s1, s1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s3, s5
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s4, s5
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s5, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v4i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s5, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s5
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s4, s5
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v0, s1, v1, v0
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
+; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v4i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v4i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX7-NEXT:    s_and_b32 s1, s1, s4
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s2, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s3, s4
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s4, v1
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v4i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v2, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s1
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v4
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
+; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v2
+; GFX7-NEXT:    s_and_b32 s1, s2, s0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v4i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_lshl_b32 s2, s1, s2
+; GFX9-NEXT:    s_not_b32 s2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
+; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
+; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
+; GFX7-NEXT:    s_lshl_b32 s1, s0, s1
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v4i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s1
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v4i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_mov_b32_e32 v6, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v4i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s2, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v3
+; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, s2, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, s2, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, v4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+  %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
+  store <4 x i8> %insert, <4 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v8i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s10, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s10
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s6, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s7, s10
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s8, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s9, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s5, 2
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX9-NEXT:    s_and_b32 s5, s5, 3
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX9-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
+; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s10
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s4, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s10
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s6, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s7, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s10, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX8-NEXT:    s_and_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s3, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s6, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s7, s10
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s8, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s9, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshr_b32 s2, s5, 2
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX8-NEXT:    s_and_b32 s5, s5, 3
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX8-NEXT:    s_and_b32 s4, s4, s10
+; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX8-NEXT:    s_andn2_b32 s3, s3, s5
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX8-NEXT:    s_and_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s3, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s4, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s5, s10
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s6, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s7, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s10, 0xff
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX7-NEXT:    s_and_b32 s2, s2, s10
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s3, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s6, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s7, s10
+; GFX7-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s8, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s9, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_lshr_b32 s2, s5, 2
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX7-NEXT:    s_and_b32 s5, s5, 3
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX7-NEXT:    s_and_b32 s4, s4, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX7-NEXT:    s_lshl_b32 s5, s10, s5
+; GFX7-NEXT:    s_andn2_b32 s3, s3, s5
+; GFX7-NEXT:    s_or_b32 s3, s3, s4
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX7-NEXT:    s_cselect_b32 s4, s3, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    s_cselect_b32 s3, s3, s1
+; GFX7-NEXT:    s_lshr_b32 s2, s4, 8
+; GFX7-NEXT:    s_and_b32 s2, s2, s10
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX7-NEXT:    s_and_b32 s4, s4, s10
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s2, s4, s2
+; GFX7-NEXT:    s_and_b32 s4, s5, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX7-NEXT:    s_or_b32 s2, s2, s4
+; GFX7-NEXT:    s_and_b32 s4, s6, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s7, s3, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s4
+; GFX7-NEXT:    s_and_b32 s4, s7, s10
+; GFX7-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX7-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX7-NEXT:    s_and_b32 s3, s3, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX7-NEXT:    s_or_b32 s3, s3, s4
+; GFX7-NEXT:    s_and_b32 s4, s8, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX7-NEXT:    s_or_b32 s3, s3, s4
+; GFX7-NEXT:    s_and_b32 s4, s9, s10
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_or_b32 s3, s3, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v8i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    s_lshr_b32 s1, s3, 2
+; GFX9-NEXT:    s_and_b32 s3, s3, 3
+; GFX9-NEXT:    s_and_b32 s2, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX9-NEXT:    s_not_b32 s3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v4
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v3, v4, s3, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v3
+; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_lshr_b32 s1, s3, 2
+; GFX8-NEXT:    s_and_b32 s3, s3, 3
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s6, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s3, 3
+; GFX7-NEXT:    s_lshr_b32 s0, s3, 2
+; GFX7-NEXT:    s_and_b32 s2, s2, s6
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX7-NEXT:    s_lshl_b32 s1, s6, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(1)* %vec
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v8i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s10, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v0, s10, v0
+; GFX9-NEXT:    s_mov_b32 s5, 8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s10
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s6, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s7, s10
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s8, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s9, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s4, 2
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX9-NEXT:    s_and_b32 s4, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s6, s10, s4
+; GFX9-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s10, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v2, v1, s10, v2
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s9, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX8-NEXT:    s_and_b32 s2, s2, s9
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s3, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s5, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s6, s9
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s7, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s8, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshr_b32 s2, s4, 2
+; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX8-NEXT:    s_and_b32 s4, s4, 3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    s_lshl_b32 s4, s9, s4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
+; GFX8-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s9, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, s9, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX7-NEXT:    s_and_b32 s2, s2, s9
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s3, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s5, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s6, s9
+; GFX7-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s7, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s8, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_lshr_b32 s2, s4, 2
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
+; GFX7-NEXT:    s_and_b32 s4, s4, 3
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_lshl_b32 s4, s9, s4
+; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
+; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s9, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v8i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s10, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s10
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s6, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s7, s10
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s8, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s9, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_and_b32 s2, s4, s10
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s10
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s5, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s10, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v2, v1, s10, v2
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s9, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX8-NEXT:    s_and_b32 s2, s2, s9
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s3, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s5, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s6, s9
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s7, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s8, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_and_b32 s2, s4, s9
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s9
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s9, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX7-NEXT:    s_and_b32 s2, s2, s9
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s3, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s5, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s6, s9
+; GFX7-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s7, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s8, s9
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_and_b32 s2, s4, s9
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s9, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s9, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s9, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v8i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s9, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s3, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s6, s9
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s7, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s8, s9
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s9, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v2, v1, s9, v2
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v8i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s8, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX8-NEXT:    s_and_b32 s2, s2, s8
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s3, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s4, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s5, s8
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s6, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s7, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v8i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s8, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX7-NEXT:    s_and_b32 s2, s2, s8
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s3, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s4, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s2, s5, s8
+; GFX7-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s6, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s7, s8
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s2
+; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s8, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+; insertelement_v_v8i8_s_v: insert an SGPR (inreg) i8 %val at a VGPR i32 %idx
+; into an <8 x i8> loaded through a VGPR global (addrspace 1) pointer, then
+; store the result to a null global pointer. Exercises the mixed
+; scalar-value/vector-index path of the bitcast legalization of
+; G_INSERT_VECTOR_ELT; the bytes are repacked inside two 32-bit registers.
+; CHECK lines appear machine-generated (update_llvm_test_checks.py) —
+; regenerate rather than hand-edit them.
+define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v8i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    s_and_b32 s1, s2, s3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v5, v2, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
+; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v7
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v9
+; GFX9-NEXT:    v_or3_b32 v1, v1, v10, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v6, v2, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
+; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s3, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    s_and_b32 s0, s2, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, s3, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, s3, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, s3, v6
+; GFX7-NEXT:    v_and_b32_e32 v9, s3, v9
+; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s3, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s3, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s3, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, s3, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  ; Load, insert at the dynamic index, store to null (addrspace 1).
+  %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+; insertelement_v_v8i8_v_s: insert a VGPR i8 %val at an SGPR (inreg) i32 %idx
+; into an <8 x i8> loaded through a VGPR global (addrspace 1) pointer, then
+; store the result to a null global pointer. With a uniform index the byte
+; offset and mask are computed on the scalar unit (s_lshr/s_lshl/s_not)
+; before the per-byte repack; exercises the bitcast legalization of
+; G_INSERT_VECTOR_ELT. CHECK lines appear machine-generated
+; (update_llvm_test_checks.py) — regenerate rather than hand-edit them.
+define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v8i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    s_lshr_b32 s1, s2, 2
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX9-NEXT:    s_not_b32 s2, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v4
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v4, s2, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
+; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_lshr_b32 s1, s2, 2
+; GFX8-NEXT:    s_and_b32 s2, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX8-NEXT:    v_mov_b32_e32 v6, s2
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
+; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s3, 0xff
+; GFX7-NEXT:    s_and_b32 s1, s2, 3
+; GFX7-NEXT:    s_lshr_b32 s0, s2, 2
+; GFX7-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v2
+; GFX7-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s3, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, s3, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
+; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, s3, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, s3, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s3, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s3, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, s3, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  ; Load, insert at the uniform index, store to null (addrspace 1).
+  %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v8i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v7
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX9-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v2
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v8i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v5, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v8i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v10
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX7-NEXT:    v_and_b32_e32 v11, s0, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v6, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, v6, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, v7, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, v5, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, v8, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
+  %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
+  store <8 x i8> %insert, <8 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v16i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s18, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX9-NEXT:    s_and_b32 s6, s6, s18
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s8, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s9, s18
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s10, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s11, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s12, s18
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s13, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s14, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s15, s18
+; GFX9-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX9-NEXT:    s_and_b32 s3, s3, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_and_b32 s6, s16, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_and_b32 s6, s17, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s5, 2
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX9-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX9-NEXT:    s_and_b32 s5, s5, 3
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX9-NEXT:    s_and_b32 s4, s4, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX9-NEXT:    s_lshl_b32 s5, s18, s5
+; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 8
+; GFX9-NEXT:    s_and_b32 s4, s4, s18
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_and_b32 s4, s5, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_and_b32 s4, s7, s18
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s4, s8, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s4, s9, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s4, s10, s18
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s11, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s12, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s13, s18
+; GFX9-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s15, s3, 24
+; GFX9-NEXT:    s_and_b32 s3, s3, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    s_and_b32 s4, s14, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    s_and_b32 s4, s15, s18
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s18, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX8-NEXT:    s_and_b32 s6, s6, s18
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s7, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s8, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s9, s18
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s6, s10, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s6, s11, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s6, s12, s18
+; GFX8-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s6, s13, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s6, s14, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s6, s15, s18
+; GFX8-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX8-NEXT:    s_and_b32 s3, s3, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_or_b32 s3, s3, s6
+; GFX8-NEXT:    s_and_b32 s6, s16, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s6
+; GFX8-NEXT:    s_and_b32 s6, s17, s18
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_or_b32 s3, s3, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 2
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX8-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX8-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX8-NEXT:    s_and_b32 s5, s5, 3
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX8-NEXT:    s_and_b32 s4, s4, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshl_b32 s5, s18, s5
+; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 8
+; GFX8-NEXT:    s_and_b32 s4, s4, s18
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s5, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s6, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s7, s18
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, s8, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, s9, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, s10, s18
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, s11, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, s12, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, s13, s18
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s3, 24
+; GFX8-NEXT:    s_and_b32 s3, s3, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_and_b32 s4, s14, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_and_b32 s4, s15, s18
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s18, 0xff
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX7-NEXT:    s_and_b32 s6, s6, s18
+; GFX7-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s6
+; GFX7-NEXT:    s_and_b32 s6, s7, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s6
+; GFX7-NEXT:    s_and_b32 s6, s8, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s6
+; GFX7-NEXT:    s_and_b32 s6, s9, s18
+; GFX7-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s6
+; GFX7-NEXT:    s_and_b32 s6, s10, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s6
+; GFX7-NEXT:    s_and_b32 s6, s11, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s6
+; GFX7-NEXT:    s_and_b32 s6, s12, s18
+; GFX7-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX7-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX7-NEXT:    s_and_b32 s2, s2, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s6, s13, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s6, s14, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s6, s15, s18
+; GFX7-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX7-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX7-NEXT:    s_and_b32 s3, s3, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s3, s3, s6
+; GFX7-NEXT:    s_and_b32 s6, s16, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_or_b32 s3, s3, s6
+; GFX7-NEXT:    s_and_b32 s6, s17, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_or_b32 s3, s3, s6
+; GFX7-NEXT:    s_lshr_b32 s6, s5, 2
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX7-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX7-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX7-NEXT:    s_and_b32 s5, s5, 3
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX7-NEXT:    s_and_b32 s4, s4, s18
+; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX7-NEXT:    s_lshl_b32 s5, s18, s5
+; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
+; GFX7-NEXT:    s_or_b32 s4, s5, s4
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX7-NEXT:    s_cselect_b32 s5, s4, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX7-NEXT:    s_cselect_b32 s7, s4, s1
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX7-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX7-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX7-NEXT:    s_lshr_b32 s4, s5, 8
+; GFX7-NEXT:    s_and_b32 s4, s4, s18
+; GFX7-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s8, s5, 24
+; GFX7-NEXT:    s_and_b32 s5, s5, s18
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX7-NEXT:    s_or_b32 s4, s5, s4
+; GFX7-NEXT:    s_and_b32 s5, s6, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s9, s7, 8
+; GFX7-NEXT:    s_or_b32 s4, s4, s5
+; GFX7-NEXT:    s_and_b32 s5, s8, s18
+; GFX7-NEXT:    s_and_b32 s6, s9, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_or_b32 s4, s4, s5
+; GFX7-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX7-NEXT:    s_and_b32 s5, s7, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
+; GFX7-NEXT:    s_and_b32 s6, s10, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_lshr_b32 s11, s7, 24
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
+; GFX7-NEXT:    s_and_b32 s6, s11, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
+; GFX7-NEXT:    s_and_b32 s6, s12, s18
+; GFX7-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX7-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX7-NEXT:    s_and_b32 s2, s2, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s6, s13, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_and_b32 s6, s14, s18
+; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX7-NEXT:    s_or_b32 s6, s2, s6
+; GFX7-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX7-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX7-NEXT:    s_and_b32 s2, s3, s18
+; GFX7-NEXT:    s_and_b32 s3, s15, s18
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s3
+; GFX7-NEXT:    s_and_b32 s3, s16, s18
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX7-NEXT:    s_or_b32 s2, s2, s3
+; GFX7-NEXT:    s_and_b32 s3, s17, s18
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX7-NEXT:    s_or_b32 s7, s2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v16i8_s_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    s_and_b32 s1, s3, 3
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 2
+; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s1, s6, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    s_not_b32 s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v6
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v11
+; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v13
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v9
+; GFX9-NEXT:    v_or3_b32 v2, v2, v14, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s5, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v5
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v6
+; GFX9-NEXT:    v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v7
+; GFX9-NEXT:    v_and_b32_sdwa v15, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX9-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX9-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GFX9-NEXT:    v_or3_b32 v3, v3, v15, v16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i8_s_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, s0
+; GFX8-NEXT:    s_and_b32 s1, s3, 3
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 2
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    s_and_b32 s2, s2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s5, s2, s1
+; GFX8-NEXT:    s_not_b32 s6, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX8-NEXT:    v_and_b32_sdwa v17, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v18, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, s5, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v14
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i8_s_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s6, 0xff
+; GFX7-NEXT:    s_and_b32 s0, s3, 3
+; GFX7-NEXT:    s_lshr_b32 s4, s3, 2
+; GFX7-NEXT:    s_and_b32 s1, s2, s6
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    s_lshl_b32 s5, s1, s0
+; GFX7-NEXT:    s_lshl_b32 s0, s6, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    s_not_b32 s7, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, s6, v8
+; GFX7-NEXT:    v_and_b32_e32 v10, s6, v10
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX7-NEXT:    v_and_b32_e32 v9, s6, v9
+; GFX7-NEXT:    v_and_b32_e32 v11, s6, v11
+; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
+; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v12
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_s_v16i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s18, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v0, s18, v0
+; GFX9-NEXT:    s_mov_b32 s5, 8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX9-NEXT:    s_and_b32 s6, s6, s18
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s8, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s9, s18
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s10, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s11, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s6, s12, s18
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s13, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s14, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s15, s18
+; GFX9-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX9-NEXT:    s_and_b32 s3, s3, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_and_b32 s6, s16, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_and_b32 s6, s17, s18
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 2
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX9-NEXT:    s_cselect_b32 s7, s2, s7
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
+; GFX9-NEXT:    s_and_b32 s4, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX9-NEXT:    s_lshl_b32 s8, s18, s4
+; GFX9-NEXT:    s_andn2_b32 s7, s7, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s18, v4
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s18, v5
+; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v3, s18, v4
+; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s17, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX8-NEXT:    s_and_b32 s5, s5, s17
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s5, s6, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s5, s7, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s5, s8, s17
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s5
+; GFX8-NEXT:    s_and_b32 s5, s9, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s5
+; GFX8-NEXT:    s_and_b32 s5, s10, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 8
+; GFX8-NEXT:    s_or_b32 s1, s1, s5
+; GFX8-NEXT:    s_and_b32 s5, s11, s17
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
+; GFX8-NEXT:    s_and_b32 s5, s12, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
+; GFX8-NEXT:    s_and_b32 s5, s13, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
+; GFX8-NEXT:    s_and_b32 s5, s14, s17
+; GFX8-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s16, s3, 24
+; GFX8-NEXT:    s_and_b32 s3, s3, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_or_b32 s3, s3, s5
+; GFX8-NEXT:    s_and_b32 s5, s15, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s5
+; GFX8-NEXT:    s_and_b32 s5, s16, s17
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_or_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshr_b32 s5, s4, 2
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX8-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX8-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX8-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX8-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX8-NEXT:    s_and_b32 s4, s4, 3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    s_lshl_b32 s4, s17, s4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX8-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s17
+; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s17, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, s17, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX7-NEXT:    s_and_b32 s5, s5, s17
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s5, s6, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s5, s7, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s8, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s5, s8, s17
+; GFX7-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s10, s1, 24
+; GFX7-NEXT:    s_and_b32 s1, s1, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s5
+; GFX7-NEXT:    s_and_b32 s5, s9, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_or_b32 s1, s1, s5
+; GFX7-NEXT:    s_and_b32 s5, s10, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s11, s2, 8
+; GFX7-NEXT:    s_or_b32 s1, s1, s5
+; GFX7-NEXT:    s_and_b32 s5, s11, s17
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX7-NEXT:    s_lshr_b32 s13, s2, 24
+; GFX7-NEXT:    s_and_b32 s2, s2, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
+; GFX7-NEXT:    s_and_b32 s5, s12, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
+; GFX7-NEXT:    s_and_b32 s5, s13, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
+; GFX7-NEXT:    s_and_b32 s5, s14, s17
+; GFX7-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX7-NEXT:    s_lshr_b32 s16, s3, 24
+; GFX7-NEXT:    s_and_b32 s3, s3, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_or_b32 s3, s3, s5
+; GFX7-NEXT:    s_and_b32 s5, s15, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_or_b32 s3, s3, s5
+; GFX7-NEXT:    s_and_b32 s5, s16, s17
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_or_b32 s3, s3, s5
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 2
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX7-NEXT:    s_cselect_b32 s6, s1, s0
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX7-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX7-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX7-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX7-NEXT:    s_and_b32 s4, s4, 3
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_lshl_b32 s4, s17, s4
+; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s17, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s17, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s17, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s17, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s17, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v16i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s18, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX9-NEXT:    s_and_b32 s5, s5, s18
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s18
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s5
+; GFX9-NEXT:    s_and_b32 s5, s7, s18
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s5
+; GFX9-NEXT:    s_and_b32 s5, s8, s18
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX9-NEXT:    s_or_b32 s8, s0, s5
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX9-NEXT:    s_and_b32 s0, s1, s18
+; GFX9-NEXT:    s_and_b32 s1, s9, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s10, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s11, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX9-NEXT:    s_or_b32 s9, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s12, s18
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX9-NEXT:    s_and_b32 s0, s2, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s13, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s14, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX9-NEXT:    s_or_b32 s10, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s15, s18
+; GFX9-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX9-NEXT:    s_and_b32 s0, s3, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s16, s18
+; GFX9-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s17, s18
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    s_or_b32 s11, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_and_b32 s4, s4, s18
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s18
+; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_or_b32 v5, v1, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s18, v4
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s18, v5
+; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v3, s18, v4
+; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s18, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX8-NEXT:    s_and_b32 s5, s5, s18
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s18
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s5, s6, s18
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s5, s7, s18
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX8-NEXT:    s_or_b32 s8, s0, s5
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX8-NEXT:    s_and_b32 s0, s1, s18
+; GFX8-NEXT:    s_and_b32 s1, s9, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s10, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s11, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX8-NEXT:    s_or_b32 s9, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s12, s18
+; GFX8-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s13, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s14, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX8-NEXT:    s_or_b32 s10, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s15, s18
+; GFX8-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX8-NEXT:    s_and_b32 s0, s3, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s16, s18
+; GFX8-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s17, s18
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    s_or_b32 s11, s0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_and_b32 s4, s4, s18
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s18
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s18, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX7-NEXT:    s_and_b32 s5, s5, s18
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s5, s6, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s5, s7, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX7-NEXT:    s_or_b32 s8, s0, s5
+; GFX7-NEXT:    s_lshr_b32 s10, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s11, s1, 24
+; GFX7-NEXT:    s_and_b32 s0, s1, s18
+; GFX7-NEXT:    s_and_b32 s1, s9, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s10, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s11, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
+; GFX7-NEXT:    s_or_b32 s9, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s12, s18
+; GFX7-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX7-NEXT:    s_and_b32 s0, s2, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s13, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s14, s2, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s14, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
+; GFX7-NEXT:    s_or_b32 s10, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s15, s18
+; GFX7-NEXT:    s_lshr_b32 s16, s3, 16
+; GFX7-NEXT:    s_and_b32 s0, s3, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s16, s18
+; GFX7-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s17, s18
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    s_or_b32 s11, s0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s10
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    s_and_b32 s4, s4, s18
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s18, v0
+; GFX7-NEXT:    v_mov_b32_e32 v5, s11
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    v_mov_b32_e32 v3, s11
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s18, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s18, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s18, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s18, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s18, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_s_v16i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-NEXT:    s_movk_i32 s17, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 8
+; GFX9-NEXT:    s_and_b32 s4, s4, s17
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s17
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_and_b32 s4, s5, s17
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, s17
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX9-NEXT:    s_or_b32 s4, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 24
+; GFX9-NEXT:    s_and_b32 s0, s1, s17
+; GFX9-NEXT:    s_and_b32 s1, s7, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s9, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s10, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 8
+; GFX9-NEXT:    s_or_b32 s5, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s11, s17
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX9-NEXT:    s_and_b32 s0, s2, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s12, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s13, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX9-NEXT:    s_or_b32 s6, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s14, s17
+; GFX9-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX9-NEXT:    s_and_b32 s0, s3, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s15, s17
+; GFX9-NEXT:    s_lshr_b32 s16, s3, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s16, s17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    s_or_b32 s7, s0, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_and_or_b32 v5, v2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s8, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s17, v4
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s17, v5
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s17, v5
+; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v3, s17, v4
+; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_s_v16i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX8-NEXT:    s_movk_i32 s16, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 8
+; GFX8-NEXT:    s_and_b32 s4, s4, s16
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s5, s16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s6, s16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX8-NEXT:    s_or_b32 s4, s0, s4
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX8-NEXT:    s_and_b32 s0, s1, s16
+; GFX8-NEXT:    s_and_b32 s1, s7, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s8, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s9, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX8-NEXT:    s_or_b32 s5, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s10, s16
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s11, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s12, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX8-NEXT:    s_or_b32 s6, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s13, s16
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX8-NEXT:    s_and_b32 s0, s3, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s14, s16
+; GFX8-NEXT:    s_lshr_b32 s15, s3, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s15, s16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_or_b32 s7, s0, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s16
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s16
+; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_s_v16i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX7-NEXT:    s_movk_i32 s16, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s4, s0, 8
+; GFX7-NEXT:    s_and_b32 s4, s4, s16
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX7-NEXT:    s_and_b32 s0, s0, s16
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s4
+; GFX7-NEXT:    s_and_b32 s4, s5, s16
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s4
+; GFX7-NEXT:    s_and_b32 s4, s6, s16
+; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX7-NEXT:    s_or_b32 s4, s0, s4
+; GFX7-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX7-NEXT:    s_and_b32 s0, s1, s16
+; GFX7-NEXT:    s_and_b32 s1, s7, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s8, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s9, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX7-NEXT:    s_or_b32 s5, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s10, s16
+; GFX7-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX7-NEXT:    s_and_b32 s0, s2, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s11, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s12, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX7-NEXT:    s_or_b32 s6, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s13, s16
+; GFX7-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX7-NEXT:    s_and_b32 s0, s3, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s14, s16
+; GFX7-NEXT:    s_lshr_b32 s15, s3, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s1, s15, s16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_or_b32 s7, s0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v5, s6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s16, v1
+; GFX7-NEXT:    v_mov_b32_e32 v6, s7
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v13
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s16, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v16i8_s_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 2, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    s_and_b32 s1, s2, s6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v7, v2, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v8
+; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v9
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v10
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v3, v3, v12, v13
+; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v15
+; GFX9-NEXT:    v_and_b32_sdwa v18, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v19, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v6, v6, s6, v11
+; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v19
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v8, v2, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s6, v0
+; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v7
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
+; GFX9-NEXT:    v_or3_b32 v0, v1, v9, v10
+; GFX9-NEXT:    v_or3_b32 v1, v3, v11, v12
+; GFX9-NEXT:    v_or3_b32 v2, v4, v13, v14
+; GFX9-NEXT:    v_or3_b32 v3, v5, v15, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i8_s_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    s_and_b32 s1, s2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT:    v_lshlrev_b32_e64 v9, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v14, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_and_b32_sdwa v17, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v19, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v18
+; GFX8-NEXT:    v_and_b32_sdwa v6, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v12
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v14
+; GFX8-NEXT:    v_or_b32_e32 v4, v1, v16
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v13
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i8_s_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s6, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 2, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT:    s_and_b32 s0, s2, s6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT:    v_lshl_b32_e32 v18, s0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v17
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v17
+; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_and_b32_e32 v8, s6, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_and_b32_e32 v9, s6, v9
+; GFX7-NEXT:    v_and_b32_e32 v11, s6, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s6, v10
+; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
+; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
+; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_and_b32_e32 v16, s6, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v13
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v16
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v18
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) {
+; GFX9-LABEL: insertelement_v_v16i8_v_s:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    s_and_b32 s1, s2, 3
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
+; GFX9-NEXT:    s_lshl_b32 s1, s6, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX9-NEXT:    s_not_b32 s5, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
+; GFX9-NEXT:    v_and_b32_sdwa v10, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX9-NEXT:    v_and_b32_sdwa v12, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v4, s6, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v14, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v10, v11
+; GFX9-NEXT:    v_or3_b32 v3, v3, v12, v13
+; GFX9-NEXT:    v_and_b32_sdwa v16, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s6, v9
+; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v1, v6, s5, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v9, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v10, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s6, v0
+; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v7
+; GFX9-NEXT:    v_or3_b32 v0, v2, v9, v10
+; GFX9-NEXT:    v_or3_b32 v1, v3, v11, v12
+; GFX9-NEXT:    v_or3_b32 v2, v4, v13, v14
+; GFX9-NEXT:    v_or3_b32 v3, v5, v15, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i8_v_s:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    s_and_b32 s1, s2, 3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s0
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX8-NEXT:    s_not_b32 s5, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX8-NEXT:    v_and_b32_sdwa v16, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v16
+; GFX8-NEXT:    v_and_b32_sdwa v18, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v6, s5, v6
+; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v12
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v14
+; GFX8-NEXT:    v_or_b32_e32 v4, v1, v16
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v13
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i8_v_s:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s6, 0xff
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v2
+; GFX7-NEXT:    s_and_b32 s0, s2, 3
+; GFX7-NEXT:    s_lshr_b32 s4, s2, 2
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_lshl_b32 s0, s6, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; GFX7-NEXT:    s_not_b32 s5, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, s6, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_and_b32_e32 v9, s6, v9
+; GFX7-NEXT:    v_and_b32_e32 v11, s6, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
+; GFX7-NEXT:    v_and_b32_e32 v10, s6, v10
+; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
+; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
+; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_and_b32_e32 v16, s6, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v14
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v13
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v16
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[2:3]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}
+
+define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) {
+; GFX9-LABEL: insertelement_v_v16i8_v_v:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_movk_i32 s1, 0xff
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s1, v9
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v15, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s1, v10
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v17, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v6, v6, s1, v11
+; GFX9-NEXT:    v_and_b32_sdwa v19, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v9, v7, v0, v12
+; GFX9-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v4, v4, v13, v14
+; GFX9-NEXT:    v_or3_b32 v5, v5, v15, v16
+; GFX9-NEXT:    v_or3_b32 v7, v9, v19, v7
+; GFX9-NEXT:    v_or3_b32 v6, v6, v17, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v7, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v9, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v13, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v4, v4, v0, v7
+; GFX9-NEXT:    v_and_or_b32 v5, v5, v0, v8
+; GFX9-NEXT:    v_and_b32_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v6
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v0, v1
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v1, v4, v12, v13
+; GFX9-NEXT:    v_or3_b32 v2, v5, v14, v15
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_or3_b32 v0, v3, v10, v11
+; GFX9-NEXT:    v_or3_b32 v3, v6, v16, v17
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX8-LABEL: insertelement_v_v16i8_v_v:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_mov_b32_e32 v9, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v10
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v10
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_and_b32_sdwa v9, v5, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX8-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX8-NEXT:    v_or_b32_e32 v4, v2, v16
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v17
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX7-LABEL: insertelement_v_v16i8_v_v:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v19
+; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_and_b32_e32 v10, s0, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 8, v7
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_and_b32_e32 v11, s0, v11
+; GFX7-NEXT:    v_and_b32_e32 v13, v13, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX7-NEXT:    v_and_b32_e32 v12, s0, v12
+; GFX7-NEXT:    v_and_b32_e32 v14, v14, v8
+; GFX7-NEXT:    v_and_b32_e32 v16, v16, v8
+; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 24, v7
+; GFX7-NEXT:    v_and_b32_e32 v15, v15, v8
+; GFX7-NEXT:    v_and_b32_e32 v17, v17, v8
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_and_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX7-NEXT:    v_or_b32_e32 v1, v4, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX7-NEXT:    v_and_b32_e32 v18, v18, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v16
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v17
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 24, v18
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v15
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v18
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v3, v6, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, v6, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, v7, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v1, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, v9, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, v10, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, v3, v8
+; GFX7-NEXT:    v_and_b32_e32 v3, v11, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, v12, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, v13, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, v14, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, v15, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, v16, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+  %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
+  store <16 x i8> %insert, <16 x i8> addrspace(1)* null
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
index 58610f484f6b..3f3ec6216585 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
@@ -1738,3 +1738,195 @@ body: |
     %5:_(p1) = COPY $vgpr0_vgpr1
     G_STORE %4, %5 :: (store 256, align 4, addrspace 1)
 ...
+
+---
+name: insert_vector_elt_varidx_v4s8
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: insert_vector_elt_varidx_v4s8
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32)
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[SHL3]](s32)
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[C3]], [[SHL3]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL5]], [[C5]]
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[XOR]]
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL4]]
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C]](s32)
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C1]](s32)
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C2]](s32)
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[OR3]](s32)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL6]]
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C1]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL7]]
+    ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C2]](s32)
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL8]]
+    ; CHECK: $vgpr0 = COPY [[OR6]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(<4 x s8>) = G_BITCAST %0
+    %4:_(s8) = G_TRUNC %1
+    %5:_(<4 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2
+    %6:_(s32) = G_BITCAST %5
+    $vgpr0 = COPY %6
+...
+
+---
+name: insert_vector_elt_varidx_v8s8
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; CHECK-LABEL: name: insert_vector_elt_varidx_v8s8
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16)
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]]
+    ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C5]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32)
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[SHL6]](s32)
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[SHL6]](s32)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL8]], [[C7]]
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[EVEC]], [[XOR]]
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]]
+    ; CHECK: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[OR6]](s32), [[LSHR6]](s32)
+    ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC]](<2 x s32>)
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
+    ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
+    ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C4]](s32)
+    ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
+    ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
+    ; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C4]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
+    ; CHECK: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C8]]
+    ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; CHECK: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C8]]
+    ; CHECK: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND12]], [[C1]](s16)
+    ; CHECK: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND11]], [[SHL9]]
+    ; CHECK: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
+    ; CHECK: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C8]]
+    ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; CHECK: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C8]]
+    ; CHECK: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND14]], [[C1]](s16)
+    ; CHECK: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND13]], [[SHL10]]
+    ; CHECK: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32)
+    ; CHECK: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C8]]
+    ; CHECK: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
+    ; CHECK: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C8]]
+    ; CHECK: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND16]], [[C1]](s16)
+    ; CHECK: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND15]], [[SHL11]]
+    ; CHECK: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+    ; CHECK: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C8]]
+    ; CHECK: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
+    ; CHECK: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C8]]
+    ; CHECK: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND18]], [[C1]](s16)
+    ; CHECK: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND17]], [[SHL12]]
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
+    ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
+    ; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+    ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL13]]
+    ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
+    ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
+    ; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
+    ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL14]]
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR11]](s32), [[OR12]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(s32) = COPY $vgpr3
+    %3:_(<8 x s8>) = G_BITCAST %0
+    %4:_(s8) = G_TRUNC %1
+    %5:_(<8 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2
+    %6:_(s64) = G_BITCAST %5
+    $vgpr0_vgpr1 = COPY %6
+...


        


More information about the llvm-commits mailing list