[llvm] d2e66d7 - [GlobalISel] Add a combine for and(load, mask) -> zextload

Konstantin Schwarz via llvm-commits <llvm-commits@lists.llvm.org>
Thu Sep 16 01:43:22 PDT 2021


Author: Konstantin Schwarz
Date: 2021-09-16T10:42:46+02:00
New Revision: d2e66d7fa46b14a749ff8686ecccf66292b7bc6b

URL: https://github.com/llvm/llvm-project/commit/d2e66d7fa46b14a749ff8686ecccf66292b7bc6b
DIFF: https://github.com/llvm/llvm-project/commit/d2e66d7fa46b14a749ff8686ecccf66292b7bc6b.diff

LOG: [GlobalISel] Add a combine for and(load, mask) -> zextload

This only handles simple masks (low-bit runs of ones, i.e. constants of the form 2^n - 1), not shifted masks, for now.
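
For illustration, a sketch in generic MIR (the register names are
illustrative; this mirrors the test_load_s32 case in the added tests):

  %mask:_(s32) = G_CONSTANT i32 255
  %ld:_(s32) = G_LOAD %ptr(p0) :: (load (s32))
  %and:_(s32) = G_AND %ld, %mask

becomes a single zero-extending narrow load:

  %and:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load (s8), align 4)

The fold requires that the mask be no wider than the memory size,
strictly narrower than the destination type, and a byte-sized power of
two, and that the load be simple with a single non-debug use.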

Reviewed By: aemerson

Differential Revision: https://reviews.llvm.org/D109357

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 624f00cabcde5..5e3f3717952da 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -172,6 +172,9 @@ class CombinerHelper {
   bool matchCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
   void applyCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
 
+  /// Match (and (load x), mask) -> zextload x
+  bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Combine \p MI into a pre-indexed or post-indexed load/store operation if
   /// legal and the surrounding code makes it useful.
   bool tryCombineIndexedLoadStore(MachineInstr &MI);

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 5697e1e592c09..09be6db7d46a6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -130,7 +130,13 @@ def extending_loads : GICombineRule<
   (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD):$root,
          [{ return Helper.matchCombineExtendingLoads(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>;
-def combines_for_extload: GICombineGroup<[extending_loads]>;
+
+def load_and_mask : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_AND):$root,
+        [{ return Helper.matchCombineLoadWithAndMask(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def combines_for_extload: GICombineGroup<[extending_loads, load_and_mask]>;
 
 def sext_trunc_sextload : GICombineRule<
   (defs root:$d),

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4cd4e2de73941..26bea3ca5600a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -633,6 +633,76 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
   Observer.changedInstr(MI);
 }
 
+bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
+                                                 BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+  // If we have the following code:
+  //  %mask = G_CONSTANT 255
+  //  %ld   = G_LOAD %ptr, (load s16)
+  //  %and  = G_AND %ld, %mask
+  //
+  // Try to fold it into
+  //   %ld = G_ZEXTLOAD %ptr, (load s8)
+
+  Register Dst = MI.getOperand(0).getReg();
+  if (MRI.getType(Dst).isVector())
+    return false;
+
+  auto MaybeMask =
+      getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+  if (!MaybeMask)
+    return false;
+
+  APInt MaskVal = MaybeMask->Value;
+
+  if (!MaskVal.isMask())
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI);
+  if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) ||
+      !LoadMI->isSimple())
+    return false;
+
+  Register LoadReg = LoadMI->getDstReg();
+  LLT LoadTy = MRI.getType(LoadReg);
+  Register PtrReg = LoadMI->getPointerReg();
+  uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+  unsigned MaskSizeBits = MaskVal.countTrailingOnes();
+
+  // The mask may not be larger than the in-memory type, as it might otherwise
+  // cover sign-extended bits.
+  if (MaskSizeBits > LoadSizeBits)
+    return false;
+
+  // If the mask covers the whole destination register, there's nothing to
+  // extend.
+  if (MaskSizeBits >= LoadTy.getSizeInBits())
+    return false;
+
+  // Most targets cannot deal with loads of size < 8 and need to re-legalize to
+  // at least byte loads. Avoid creating such loads here.
+  if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits))
+    return false;
+
+  const MachineMemOperand &MMO = LoadMI->getMMO();
+  LegalityQuery::MemDesc MemDesc(MMO);
+  MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.setInstrAndDebugLoc(*LoadMI);
+    auto &MF = B.getMF();
+    auto PtrInfo = MMO.getPointerInfo();
+    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8);
+    B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO);
+  };
+  return true;
+}
+
 bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
                                    const MachineInstr &UseMI) {
   assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
new file mode 100644
index 0000000000000..00fefddff1af1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
@@ -0,0 +1,252 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -aarch64prelegalizercombinerhelper-only-enable-rule="load_and_mask" -verify-machineinstrs %s -o - | FileCheck %s
+
+# REQUIRES: asserts
+
+# Check that we can fold and ({any,zext,sext}load, mask) -> zextload
+
+---
+name:            test_anyext_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_anyext_1
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[LOAD]], [[C]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s8)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s8) = G_CONSTANT i8 1
+    %2:_(s8) = G_LOAD %0 :: (load (s8))
+    %3:_(s8) = G_AND %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $w0 = COPY %4
+...
+
+---
+name:            test_anyext_s16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_anyext_s16
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s16) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ZEXTLOAD]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s16) = G_CONSTANT i16 255
+    %2:_(s16) = G_LOAD %0 :: (load (s8))
+    %3:_(s16) = G_AND %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $w0 = COPY %4
+...
+
+---
+name:            test_anyext_s32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_anyext_s32
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_LOAD %0 :: (load (s8))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_load_s32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_load_s32
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_LOAD %0 :: (load (s32))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+
+---
+name:            test_load_mask_size_equals_dst_size
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    ; The combine should only apply if the mask zeroes actual bits of the dst type.
+    ; If it doesn't, the mask is redundant, and we have other combines to fold it away.
+
+    ; CHECK-LABEL: name: test_load_mask_size_equals_dst_size
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 4294967295
+    %2:_(s32) = G_LOAD %0 :: (load (s32))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_zext
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_zext
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_ZEXTLOAD %0 :: (load (s16))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_zext_mask_larger_memsize
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    ; The combine should only apply if the mask narrows the memory size.
+    ; We have another combine that folds redundant masks.
+
+    ; CHECK-LABEL: name: test_zext_mask_larger_memsize
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ZEXTLOAD]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 65535
+    %2:_(s32) = G_ZEXTLOAD %0 :: (load (s8))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_sext
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_sext
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_SEXTLOAD %0 :: (load (s16))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_sext_mask_larger_memsize
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_sext_mask_larger_memsize
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 65535
+    %2:_(s32) = G_SEXTLOAD %0 :: (load (s8))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_non_pow2_memtype
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_non_pow2_memtype
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s24) = G_CONSTANT i24 7
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[COPY]](p0) :: (load (s24), align 4)
+    ; CHECK: [[AND:%[0-9]+]]:_(s24) = G_AND [[LOAD]], [[C]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s24)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s24) = G_CONSTANT i24 7
+    %2:_(s24) = G_LOAD %0 :: (load (s24))
+    %3:_(s24) = G_AND %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $w0 = COPY %4
+...
+
+
+---
+name:            test_no_mask
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_no_mask
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 510
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 510
+    %2:_(s32) = G_LOAD %0 :: (load (s8))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...
+
+---
+name:            test_volatile
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_volatile
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (volatile load (s8))
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_LOAD %0 :: (volatile load (s8))
+    %3:_(s32) = G_AND %2, %1
+    $w0 = COPY %3
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 26d2c8e07a28e..8ec1cc5a8fa32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -462,7 +462,6 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -479,7 +478,7 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -538,22 +537,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
 ; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    s_movk_i32 s0, 0xff
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v1, s0, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, s0, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v3, s0, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, s0, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -580,13 +574,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -784,22 +778,17 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
 ; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
 ; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    s_movk_i32 s0, 0xff
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v1, s0, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, s0, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v3, s0, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, s0, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -826,13 +815,13 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -854,11 +843,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -873,9 +861,9 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index cfd2236b817ab..54c0ba572053b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -30,40 +30,27 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[0:1], off offset:9
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off offset:10
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v13, v[0:1], off offset:11
-; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s5, 8
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 8, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v2, s4, v3
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v10
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v12, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, v13, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v5, v6, s4, v7
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v8, v10, v0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v10, 24, v12
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v4
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v5, v6, v7
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v8, v9, v10
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v13
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v6, v7, v8
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -94,40 +81,23 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
-; GFX7-NOUNALIGNED-NEXT:    s_movk_i32 s4, 0xff
-; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v7, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v10, v10, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v11, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, v12, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 8, v11
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v2, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v6, v5
@@ -160,19 +130,12 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[0:1], off offset:6
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[0:1], off offset:8
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v7, v[0:1], off offset:10
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s4, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v5
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s4, v7
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v2, s4, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v4, s4, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v6, s4, v3
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -197,25 +160,15 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s4, v4
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s4, v6
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v2
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v3, v4
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v2, v1
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2
   ret <3 x i32> %load
@@ -405,43 +358,30 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:9
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:10
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:11
-; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v7, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v10, 8, v9
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v11, v0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, v12, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v4, v5, s0, v6
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v9, v0, v10
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v4, v5, v6
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v7, v8
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v6, v7, v8
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -471,41 +411,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0 offset:9
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:10
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:11
-; GFX7-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, 0xff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, v5, v12
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v6, v6, v12
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v12
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v5
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v12
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v10, v10, v12
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v7, v12
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v11, v12
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
@@ -541,21 +466,14 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v0, s[0:1] offset:6
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v0, s[0:1] offset:8
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v0, s[0:1] offset:10
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v2
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v4
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v6
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v1, s0, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v3, s0, v2
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v5, s0, v4
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
 ;
@@ -580,24 +498,14 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:6
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:8
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v4, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 91100f2c405da..810447258f3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -39,141 +39,106 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX9-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX9-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX9-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX9-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX9-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX9-NEXT:    ds_read_u8 v9, v0 offset:7
-; GFX9-NEXT:    s_mov_b32 s5, 8
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v4, v1, v2, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NEXT:    v_or3_b32 v4, v1, v2, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v2, v8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v5, v9, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v6, s4, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v1, v2, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
-; GFX9-NEXT:    ds_read_u8 v6, v0 offset:9
-; GFX9-NEXT:    ds_read_u8 v7, v0 offset:10
-; GFX9-NEXT:    ds_read_u8 v8, v0 offset:11
-; GFX9-NEXT:    ds_read_u8 v9, v0 offset:12
-; GFX9-NEXT:    ds_read_u8 v10, v0 offset:13
-; GFX9-NEXT:    ds_read_u8 v11, v0 offset:14
+; GFX9-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX9-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX9-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX9-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX9-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX9-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v3, v6
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX9-NEXT:    v_and_b32_e32 v6, v7, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v7, v8, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_or3_b32 v2, v2, v6, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX9-NEXT:    v_or3_b32 v2, v2, v3, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 8, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v6, v11, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX9-NEXT:    v_and_or_b32 v5, v9, v3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX9-NEXT:    v_or3_b32 v3, v5, v6, v0
+; GFX9-NEXT:    v_or3_b32 v3, v3, v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v2, v8, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v2, v9, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -181,63 +146,45 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v8, v0
-; GFX10-NEXT:    ds_read_u8 v9, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
-; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
-; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
-; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
-; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
+; GFX10-NEXT:    ds_read_u8 v1, v0
+; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
+; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
+; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
+; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX10-NEXT:    v_mov_b32_e32 v17, 8
-; GFX10-NEXT:    s_mov_b32 s5, 8
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0xff
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_waitcnt lgkmcnt(15)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
+; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v12, v16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
@@ -252,7 +199,6 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    ds_read_u16 v1, v0
 ; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -262,27 +208,18 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
+; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v6
+; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v2, v5, s4, v2
-; GFX9-NEXT:    v_and_or_b32 v3, v7, s4, v3
+; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -291,63 +228,40 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
-; GFX10-NEXT:    ds_read_u16 v2, v0 offset:6
-; GFX10-NEXT:    ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT:    ds_read_u16 v4, v0 offset:14
-; GFX10-NEXT:    ds_read_u16 v5, v0
-; GFX10-NEXT:    ds_read_u16 v6, v0 offset:4
-; GFX10-NEXT:    ds_read_u16 v7, v0 offset:8
-; GFX10-NEXT:    ds_read_u16 v8, v0 offset:12
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
-; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v1
+; GFX10-NEXT:    ds_read_u16 v1, v0
+; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
+; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
+; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
+; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
+; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
+; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
+; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX10-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_and_or_b32 v1, v6, s4, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_or_b32 v2, v7, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v3, v8, s4, v3
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
   ret <4 x i32> %load

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index a79c9ebc618c0..fe6bf1504877c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -38,112 +38,83 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
-; GFX9-NEXT:    ds_read_u8 v3, v0 offset:1
-; GFX9-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX9-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX9-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX9-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX9-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX9-NEXT:    ds_read_u8 v9, v0 offset:7
-; GFX9-NEXT:    s_mov_b32 s5, 8
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v3, v1, v3, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NEXT:    v_or3_b32 v3, v1, v2, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v4, v8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v5, v9, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v6, s4, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v1, v4, v5
-; GFX9-NEXT:    ds_read_u8 v4, v0 offset:8
-; GFX9-NEXT:    ds_read_u8 v5, v0 offset:9
-; GFX9-NEXT:    ds_read_u8 v6, v0 offset:10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v2, v4
+; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX9-NEXT:    ds_read_u8 v4, v0 offset:9
+; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX9-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v4, v4, v2, v5
+; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 8, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v5, v6, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX9-NEXT:    v_or3_b32 v2, v4, v5, v0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    ds_read_u8 v1, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v4, v7, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v4, v9, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX7-NEXT:    v_and_b32_e32 v4, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v5, v5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v5, v6, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -151,52 +122,36 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v8, v0 offset:10
-; GFX10-NEXT:    ds_read_u8 v9, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v10, v0
-; GFX10-NEXT:    ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v0, v0 offset:8
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v13, 8
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_mov_b32 s5, 8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    ds_read_u8 v1, v0
+; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v12
+; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v8, v8, v12
-; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX10-NEXT:    v_and_b32_e32 v9, v9, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
 ; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9
@@ -215,76 +170,50 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
 ; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v4
+; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v5, s4, v2
+; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
-; GFX10-NEXT:    ds_read_u16 v2, v0 offset:6
-; GFX10-NEXT:    ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT:    ds_read_u16 v4, v0
-; GFX10-NEXT:    ds_read_u16 v5, v0 offset:4
-; GFX10-NEXT:    ds_read_u16 v6, v0 offset:8
-; GFX10-NEXT:    s_mov_b32 s4, 0xffff
-; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_and_b32_e32 v0, s4, v1
+; GFX10-NEXT:    ds_read_u16 v1, v0
+; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
+; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
+; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
+; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
+; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v6, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
   ret <3 x i32> %load

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 70a351ed65c40..b573142f5c4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -18,77 +18,58 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v2, v8, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v2, v9, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -96,63 +77,45 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v8, v0
-; GFX10-NEXT:    ds_read_u8 v9, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
-; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
-; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
-; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
-; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
+; GFX10-NEXT:    ds_read_u8 v1, v0
+; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
+; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
+; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
+; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX10-NEXT:    v_mov_b32_e32 v17, 8
-; GFX10-NEXT:    s_mov_b32 s5, 8
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0xff
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_waitcnt lgkmcnt(15)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
+; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v12, v16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
@@ -174,61 +137,45 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    ds_read_u8 v1, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v4, v7, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v4, v9, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX7-NEXT:    v_and_b32_e32 v4, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v5, v5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v5, v6, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v4, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -236,52 +183,36 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v8, v0 offset:10
-; GFX10-NEXT:    ds_read_u8 v9, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v10, v0
-; GFX10-NEXT:    ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v0, v0 offset:8
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v13, 8
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    s_mov_b32 s5, 8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    ds_read_u8 v1, v0
+; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX10-NEXT:    v_and_b32_e32 v6, v6, v12
+; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v8, v8, v12
-; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX10-NEXT:    v_and_b32_e32 v9, v9, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
 ; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
index 9c1ce9cff7fed..fc8a367c5491a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
@@ -12,8 +12,8 @@ body:             |
     ; CHECK-LABEL: name: remove_and_255_zextload
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1
-    ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1)
-    ; CHECK: $vgpr0 = COPY %load(s32)
+    ; CHECK: %and:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1)
+    ; CHECK: $vgpr0 = COPY %and(s32)
     %ptr:_(p1) = COPY $vgpr0_vgpr1
     %load:_(s32) = G_ZEXTLOAD %ptr :: (load (s8), addrspace 1, align 1)
     %mask:_(s32) = G_CONSTANT i32 255

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir
new file mode 100644
index 0000000000000..a16fcf464e59e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+
+# Post-legalizer should not generate illegal extending loads
+---
+name: zextload_from_load_and_mask
+tracksRegLiveness: true
+legalized: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: zextload_from_load_and_mask
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
+    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]]
+    ; CHECK: $vgpr0_vgpr1 = COPY [[AND]](s64)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 255
+    %2:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1)
+    %3:_(s64) = G_AND %2, %1
+    $vgpr0_vgpr1 = COPY %3
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 5199f033acf11..97591c7fbe1f9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -501,7 +501,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
@@ -1393,7 +1393,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 19aec679d09ac..72bb5ea5e310c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -387,7 +387,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -945,7 +945,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 6bf77bc93b1c5..52a3555c129df 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -851,21 +851,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:1
 ; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[4:5] offset:3
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v2
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
@@ -1178,21 +1173,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:1
 ; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[4:5] offset:3
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v2
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
@@ -1304,21 +1294,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:1
 ; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[4:5] offset:3
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v2
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
@@ -1438,21 +1423,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[4:5] offset:1
 ; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[4:5] offset:3
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v2
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 189715bfbb6cf..91d952269757a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -350,39 +350,30 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-GISEL-LABEL: ds12align1:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s3, 8
-; ALIGNED-GISEL-NEXT:    s_movk_i32 s2, 0xff
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v2
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:1
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:2
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:3
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v2 offset:4
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v2 offset:5
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v2 offset:6
-; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v2 offset:7
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v2 offset:1
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:2
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:3
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:4
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v2 offset:5
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v2 offset:6
+; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v2 offset:7
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v0, s2, v3
+; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v3, s2, v4
+; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, s2, v5
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
+; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v3
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, v8, v1
+; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v1, v9, v1
-; ALIGNED-GISEL-NEXT:    v_and_or_b32 v3, v6, s2, v3
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v3, v4, v1
+; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
+; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:8
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:9
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:10
@@ -453,34 +444,29 @@ define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-GISEL-LABEL: ds12align2:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 0xffff
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
-; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
-; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
-; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
-; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
-; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
+; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:2
+; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:4
+; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:6
+; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:8
+; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:10
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v0, s2, v2
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v3, 16, v1
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v4
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v1, s2, v0
-; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v3, s2, v2
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v0
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v2 offset:2
+; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v3 offset:2
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v1 offset:4
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v0 offset:6
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v1 offset:4
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0 offset:6
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v5 offset:8
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v6 offset:8
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v6 offset:10
+; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v7 offset:10
 ; ALIGNED-GISEL-NEXT:    s_endpgm
 ;
 ; UNALIGNED-LABEL: ds12align2:
