[llvm] d2e66d7 - [GlobalISel] Add a combine for and(load, mask) -> zextload
Konstantin Schwarz via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 16 01:43:22 PDT 2021
Author: Konstantin Schwarz
Date: 2021-09-16T10:42:46+02:00
New Revision: d2e66d7fa46b14a749ff8686ecccf66292b7bc6b
URL: https://github.com/llvm/llvm-project/commit/d2e66d7fa46b14a749ff8686ecccf66292b7bc6b
DIFF: https://github.com/llvm/llvm-project/commit/d2e66d7fa46b14a749ff8686ecccf66292b7bc6b.diff
LOG: [GlobalISel] Add a combine for and(load, mask) -> zextload
This only handles simple masks, not shifted masks, for now.
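In MIR terms, the combine turns a masked wide load into a narrower zero-extending
load. As a minimal sketch (mirroring the test_load_s32 case in the new AArch64
test below; register numbers are arbitrary):

  %0:_(p0) = COPY $x0
  %1:_(s32) = G_CONSTANT i32 255
  %2:_(s32) = G_LOAD %0 :: (load (s32))
  %3:_(s32) = G_AND %2, %1

becomes

  %3:_(s32) = G_ZEXTLOAD %0 :: (load (s8), align 4)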
Reviewed By: aemerson
Differential Revision: https://reviews.llvm.org/D109357
Added:
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
llvm/test/CodeGen/AMDGPU/ctlz.ll
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
llvm/test/CodeGen/AMDGPU/ds-alignment.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 624f00cabcde5..5e3f3717952da 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -172,6 +172,9 @@ class CombinerHelper {
bool matchCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
void applyCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
+ /// Match (and (load x), mask) -> zextload x
+ bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+
/// Combine \p MI into a pre-indexed or post-indexed load/store operation if
/// legal and the surrounding code makes it useful.
bool tryCombineIndexedLoadStore(MachineInstr &MI);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 5697e1e592c09..09be6db7d46a6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -130,7 +130,13 @@ def extending_loads : GICombineRule<
(match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD):$root,
[{ return Helper.matchCombineExtendingLoads(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>;
-def combines_for_extload: GICombineGroup<[extending_loads]>;
+
+def load_and_mask : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_AND):$root,
+ [{ return Helper.matchCombineLoadWithAndMask(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def combines_for_extload: GICombineGroup<[extending_loads, load_and_mask]>;
def sext_trunc_sextload : GICombineRule<
(defs root:$d),
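The new rule joins the combines_for_extload group, so it runs wherever that group
is enabled. For focused testing it can also be exercised in isolation through the
combiner's only-enable-rule option, as the new AArch64 test does (asserts builds
only, since rule filtering requires them); a sketch of such a RUN line:

  # RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner \
  # RUN:   -aarch64prelegalizercombinerhelper-only-enable-rule="load_and_mask" \
  # RUN:   -verify-machineinstrs %s -o - | FileCheck %s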
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4cd4e2de73941..26bea3ca5600a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -633,6 +633,76 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
Observer.changedInstr(MI);
}
+bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ // If we have the following code:
+ // %mask = G_CONSTANT 255
+ // %ld = G_LOAD %ptr, (load s16)
+ // %and = G_AND %ld, %mask
+ //
+ // Try to fold it into
+ // %ld = G_ZEXTLOAD %ptr, (load s8)
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI.getType(Dst).isVector())
+ return false;
+
+ auto MaybeMask =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeMask)
+ return false;
+
+ APInt MaskVal = MaybeMask->Value;
+
+ if (!MaskVal.isMask())
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI);
+ if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) ||
+ !LoadMI->isSimple())
+ return false;
+
+ Register LoadReg = LoadMI->getDstReg();
+ LLT LoadTy = MRI.getType(LoadReg);
+ Register PtrReg = LoadMI->getPointerReg();
+ uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+ unsigned MaskSizeBits = MaskVal.countTrailingOnes();
+
+ // The mask may not be larger than the in-memory type, as it might cover sign
+ // extended bits.
+ if (MaskSizeBits > LoadSizeBits)
+ return false;
+
+ // If the mask covers the whole destination register, there's nothing to
+ // extend.
+ if (MaskSizeBits >= LoadTy.getSizeInBits())
+ return false;
+
+ // Most targets cannot deal with loads of size < 8 and need to re-legalize to
+ // at least byte loads. Avoid creating such loads here.
+ if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits))
+ return false;
+
+ const MachineMemOperand &MMO = LoadMI->getMMO();
+ LegalityQuery::MemDesc MemDesc(MMO);
+ MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*LoadMI);
+ auto &MF = B.getMF();
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8);
+ B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO);
+ };
+ return true;
+}
+
bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
const MachineInstr &UseMI) {
assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
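To see why the mask must not be wider than the memory size, consider a
sign-extending load (this mirrors the test_sext_mask_larger_memsize case in the
new AArch64 test):

  %0:_(p0) = COPY $x0
  %1:_(s32) = G_CONSTANT i32 65535
  %2:_(s32) = G_SEXTLOAD %0 :: (load (s8))
  %3:_(s32) = G_AND %2, %1

Here MaskSizeBits is 16 while LoadSizeBits is 8: the sign extension may set bits
8-15 and the mask clears them, so the mask is not redundant and the combine must
not fire. The match likewise bails out when the mask already covers the whole
destination register (nothing left to zero-extend) and when the narrowed memory
type would be smaller than a byte or not a power of two.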
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
new file mode 100644
index 0000000000000..00fefddff1af1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir
@@ -0,0 +1,252 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -aarch64prelegalizercombinerhelper-only-enable-rule="load_and_mask" -verify-machineinstrs %s -o - | FileCheck %s
+
+# REQUIRES: asserts
+
+# Check that we can fold and ({any,zext,sext}load, mask) -> zextload
+
+---
+name: test_anyext_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_anyext_1
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[LOAD]], [[C]]
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s8)
+ ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s8) = G_CONSTANT i8 1
+ %2:_(s8) = G_LOAD %0 :: (load (s8))
+ %3:_(s8) = G_AND %2, %1
+ %4:_(s32) = G_ANYEXT %3
+ $w0 = COPY %4
+...
+
+---
+name: test_anyext_s16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_anyext_s16
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s16) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ZEXTLOAD]](s16)
+ ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(s16) = G_LOAD %0 :: (load (s8))
+ %3:_(s16) = G_AND %2, %1
+ %4:_(s32) = G_ANYEXT %3
+ $w0 = COPY %4
+...
+
+---
+name: test_anyext_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_anyext_s32
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 255
+ %2:_(s32) = G_LOAD %0 :: (load (s8))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_load_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_load_s32
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 4)
+ ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 255
+ %2:_(s32) = G_LOAD %0 :: (load (s32))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+
+---
+name: test_load_mask_size_equals_dst_size
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+
+ ; The combine should only apply if the mask zeroes actual bits of the dst type
+ ; If it doesn't, the mask is redundant and we have other combines to fold it away
+
+ ; CHECK-LABEL: name: test_load_mask_size_equals_dst_size
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+ ; CHECK: $w0 = COPY [[AND]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 4294967295
+ %2:_(s32) = G_LOAD %0 :: (load (s32))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_zext
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_zext
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2)
+ ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 255
+ %2:_(s32) = G_ZEXTLOAD %0 :: (load (s16))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_zext_mask_larger_memsize
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+
+ ; The combine should only apply if the mask narrows the memory size.
+ ; We have another combine that folds redundant masks
+
+ ; CHECK-LABEL: name: test_zext_mask_larger_memsize
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ZEXTLOAD]], [[C]]
+ ; CHECK: $w0 = COPY [[AND]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 65535
+ %2:_(s32) = G_ZEXTLOAD %0 :: (load (s8))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_sext
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_sext
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2)
+ ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 255
+ %2:_(s32) = G_SEXTLOAD %0 :: (load (s16))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_sext_mask_larger_memsize
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_sext_mask_larger_memsize
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]]
+ ; CHECK: $w0 = COPY [[AND]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 65535
+ %2:_(s32) = G_SEXTLOAD %0 :: (load (s8))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_non_pow2_memtype
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_non_pow2_memtype
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s24) = G_CONSTANT i24 7
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[COPY]](p0) :: (load (s24), align 4)
+ ; CHECK: [[AND:%[0-9]+]]:_(s24) = G_AND [[LOAD]], [[C]]
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s24)
+ ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s24) = G_CONSTANT i24 7
+ %2:_(s24) = G_LOAD %0 :: (load (s24))
+ %3:_(s24) = G_AND %2, %1
+ %4:_(s32) = G_ANYEXT %3
+ $w0 = COPY %4
+...
+
+
+---
+name: test_no_mask
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_no_mask
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 510
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+ ; CHECK: $w0 = COPY [[AND]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 510
+ %2:_(s32) = G_LOAD %0 :: (load (s8))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
+
+---
+name: test_volatile
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_volatile
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (volatile load (s8))
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+ ; CHECK: $w0 = COPY [[AND]](s32)
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = G_CONSTANT i32 255
+ %2:_(s32) = G_LOAD %0 :: (volatile load (s8))
+ %3:_(s32) = G_AND %2, %1
+ $w0 = COPY %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 26d2c8e07a28e..8ec1cc5a8fa32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -462,7 +462,6 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -479,7 +478,7 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -538,22 +537,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: s_movk_i32 s0, 0xff
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v1, s0, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, s0, v3
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, s0, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s0, v0
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -580,13 +574,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -784,22 +778,17 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: s_movk_i32 s0, 0xff
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v1, s0, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, s0, v3
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, s0, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s0, v0
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -826,13 +815,13 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -854,11 +843,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -873,9 +861,9 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index cfd2236b817ab..54c0ba572053b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -30,40 +30,27 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:9
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:10
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[0:1], off offset:11
-; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s4, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v3, v4, v5
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v12
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v13, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, s4, v7
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v0, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v11
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v12
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v4
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v10
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v13
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v6, v7, v8
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -94,40 +81,23 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
-; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
-; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5
@@ -160,19 +130,12 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[0:1], off offset:6
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:8
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[0:1], off offset:10
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v3
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v5
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v7
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v2, s4, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s4, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, s4, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -197,25 +160,15 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5
; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2
ret <3 x i32> %load
@@ -405,43 +358,30 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11
-; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v3, v4, v5
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v0
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v9, v0, v10
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v1, v2, v3
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v4, v5, v6
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v7, v8
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v6, v7, v8
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -471,41 +411,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11
-; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
-; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v12
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v12
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6
@@ -541,21 +466,14 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] offset:10
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v2
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v4
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v6
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v2
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v4
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
;
@@ -580,24 +498,14 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 91100f2c405da..810447258f3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -39,141 +39,106 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_u8 v1, v0
; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
-; GFX9-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX9-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX9-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX9-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX9-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX9-NEXT: ds_read_u8 v9, v0 offset:7
-; GFX9-NEXT: s_mov_b32 s5, 8
-; GFX9-NEXT: s_movk_i32 s4, 0xff
+; GFX9-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX9-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX9-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX9-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX9-NEXT: ds_read_u8 v8, v0 offset:7
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v4, s4, v5
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT: v_or3_b32 v4, v1, v2, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NEXT: v_or3_b32 v4, v1, v2, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v2, v8, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, v9, v3
-; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT: v_or3_b32 v1, v1, v2, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v8
+; GFX9-NEXT: v_or3_b32 v1, v1, v2, v3
; GFX9-NEXT: ds_read_u8 v2, v0 offset:8
-; GFX9-NEXT: ds_read_u8 v6, v0 offset:9
-; GFX9-NEXT: ds_read_u8 v7, v0 offset:10
-; GFX9-NEXT: ds_read_u8 v8, v0 offset:11
-; GFX9-NEXT: ds_read_u8 v9, v0 offset:12
-; GFX9-NEXT: ds_read_u8 v10, v0 offset:13
-; GFX9-NEXT: ds_read_u8 v11, v0 offset:14
+; GFX9-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX9-NEXT: ds_read_u8 v5, v0 offset:10
+; GFX9-NEXT: ds_read_u8 v6, v0 offset:11
+; GFX9-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX9-NEXT: ds_read_u8 v8, v0 offset:13
+; GFX9-NEXT: ds_read_u8 v9, v0 offset:14
; GFX9-NEXT: ds_read_u8 v0, v0 offset:15
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v2, v2, v3, v6
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
-; GFX9-NEXT: v_and_b32_e32 v6, v7, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v7, v8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GFX9-NEXT: v_or3_b32 v2, v2, v3, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshl_or_b32 v3, v8, 8, v7
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v6, v11, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v9
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
-; GFX9-NEXT: v_and_or_b32 v5, v9, v3, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX9-NEXT: v_or3_b32 v3, v5, v6, v0
+; GFX9-NEXT: v_or3_b32 v3, v3, v5, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: ds_read_u8 v1, v0
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX7-NEXT: v_or_b32_e32 v4, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v2, v7, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v5, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v2, v8, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, v9, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:14
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT: v_and_b32_e32 v5, v8, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v6, v10, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -181,63 +146,45 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT: ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT: ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT: ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v8, v0
-; GFX10-NEXT: ds_read_u8 v9, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
-; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
-; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
-; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
-; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
+; GFX10-NEXT: ds_read_u8 v1, v0
+; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v12, v0 offset:11
+; GFX10-NEXT: ds_read_u8 v13, v0 offset:12
+; GFX10-NEXT: ds_read_u8 v14, v0 offset:13
+; GFX10-NEXT: ds_read_u8 v15, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
-; GFX10-NEXT: v_mov_b32_e32 v17, 8
-; GFX10-NEXT: s_mov_b32 s5, 8
-; GFX10-NEXT: v_mov_b32_e32 v11, 0xff
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: s_waitcnt lgkmcnt(15)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(14)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(13)
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(12)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: s_waitcnt lgkmcnt(11)
-; GFX10-NEXT: v_and_b32_e32 v5, v5, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
-; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
+; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
-; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(7)
-; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8
+; GFX10-NEXT: s_waitcnt lgkmcnt(6)
+; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_and_b32_e32 v8, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v12, v16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6
@@ -252,7 +199,6 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: ds_read_u16 v1, v0
; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
@@ -262,27 +208,18 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: ds_read_u16 v7, v0 offset:12
; GFX9-NEXT: ds_read_u16 v8, v0 offset:14
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v6
+; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, s4, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2
-; GFX9-NEXT: v_and_or_b32 v3, v7, s4, v3
+; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
@@ -291,63 +228,40 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
; GFX7-NEXT: ds_read_u16 v7, v0 offset:12
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT: s_mov_b32 s4, 0xffff
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_and_b32_e32 v0, s4, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX10-NEXT: ds_read_u16 v2, v0 offset:6
-; GFX10-NEXT: ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT: ds_read_u16 v4, v0 offset:14
-; GFX10-NEXT: ds_read_u16 v5, v0
-; GFX10-NEXT: ds_read_u16 v6, v0 offset:4
-; GFX10-NEXT: ds_read_u16 v7, v0 offset:8
-; GFX10-NEXT: ds_read_u16 v8, v0 offset:12
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
-; GFX10-NEXT: s_waitcnt lgkmcnt(7)
-; GFX10-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX10-NEXT: ds_read_u16 v1, v0
+; GFX10-NEXT: ds_read_u16 v2, v0 offset:2
+; GFX10-NEXT: ds_read_u16 v3, v0 offset:4
+; GFX10-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
+; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
+; GFX10-NEXT: ds_read_u16 v7, v0 offset:12
+; GFX10-NEXT: ds_read_u16 v8, v0 offset:14
; GFX10-NEXT: s_waitcnt lgkmcnt(6)
-; GFX10-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: s_waitcnt lgkmcnt(3)
-; GFX10-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_and_or_b32 v1, v6, s4, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_or_b32 v2, v7, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v3, v8, s4, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
ret <4 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index a79c9ebc618c0..fe6bf1504877c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -38,112 +38,83 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_u8 v1, v0
-; GFX9-NEXT: ds_read_u8 v3, v0 offset:1
-; GFX9-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX9-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX9-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX9-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX9-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX9-NEXT: ds_read_u8 v9, v0 offset:7
-; GFX9-NEXT: s_mov_b32 s5, 8
-; GFX9-NEXT: s_movk_i32 s4, 0xff
+; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX9-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX9-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX9-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX9-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX9-NEXT: ds_read_u8 v8, v0 offset:7
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
-; GFX9-NEXT: v_and_b32_e32 v3, s4, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v4, s4, v5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT: v_or3_b32 v3, v1, v3, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NEXT: v_or3_b32 v3, v1, v2, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v4, v8, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, v9, v2
-; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5
-; GFX9-NEXT: ds_read_u8 v4, v0 offset:8
-; GFX9-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX9-NEXT: ds_read_u8 v6, v0 offset:10
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8
+; GFX9-NEXT: v_or3_b32 v1, v1, v2, v4
+; GFX9-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX9-NEXT: ds_read_u8 v4, v0 offset:9
+; GFX9-NEXT: ds_read_u8 v5, v0 offset:10
; GFX9-NEXT: ds_read_u8 v0, v0 offset:11
-; GFX9-NEXT: v_mov_b32_e32 v7, 8
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v4, v4, v2, v5
+; GFX9-NEXT: v_lshl_or_b32 v2, v4, 8, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v5, v6, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX9-NEXT: v_or3_b32 v2, v4, v5, v0
+; GFX9-NEXT: v_or3_b32 v2, v2, v4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: ds_read_u8 v1, v0
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v4, v7, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v5, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v4, v8, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, v9, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_and_b32_e32 v4, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v5, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -151,52 +122,36 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT: ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT: ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT: ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
-; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v10, v0
-; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
-; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v13, 8
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: s_mov_b32 s5, 8
-; GFX10-NEXT: s_waitcnt lgkmcnt(11)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: ds_read_u8 v1, v0
+; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v0, v0 offset:11
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: s_waitcnt lgkmcnt(7)
-; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(6)
-; GFX10-NEXT: v_and_b32_e32 v6, v6, v12
+; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
-; GFX10-NEXT: s_waitcnt lgkmcnt(3)
-; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v0
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6
; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9
@@ -215,76 +170,50 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
+; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1
-; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2
+; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT: s_mov_b32 s4, 0xffff
-; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v0, s4, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v3i32_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX10-NEXT: ds_read_u16 v2, v0 offset:6
-; GFX10-NEXT: ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT: ds_read_u16 v4, v0
-; GFX10-NEXT: ds_read_u16 v5, v0 offset:4
-; GFX10-NEXT: ds_read_u16 v6, v0 offset:8
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
-; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX10-NEXT: ds_read_u16 v1, v0
+; GFX10-NEXT: ds_read_u16 v2, v0 offset:2
+; GFX10-NEXT: ds_read_u16 v3, v0 offset:4
+; GFX10-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
+; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(3)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_and_or_b32 v0, v4, s4, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v2, v6, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 70a351ed65c40..b573142f5c4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -18,77 +18,58 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: ds_read_u8 v1, v0
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX7-NEXT: v_or_b32_e32 v4, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v2, v7, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v5, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v2, v8, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, v9, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:14
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT: v_and_b32_e32 v5, v8, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v6, v10, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -96,63 +77,45 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT: ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT: ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT: ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v8, v0
-; GFX10-NEXT: ds_read_u8 v9, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
-; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
-; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
-; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
-; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
+; GFX10-NEXT: ds_read_u8 v1, v0
+; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v12, v0 offset:11
+; GFX10-NEXT: ds_read_u8 v13, v0 offset:12
+; GFX10-NEXT: ds_read_u8 v14, v0 offset:13
+; GFX10-NEXT: ds_read_u8 v15, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
-; GFX10-NEXT: v_mov_b32_e32 v17, 8
-; GFX10-NEXT: s_mov_b32 s5, 8
-; GFX10-NEXT: v_mov_b32_e32 v11, 0xff
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: s_waitcnt lgkmcnt(15)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(14)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(13)
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(12)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: s_waitcnt lgkmcnt(11)
-; GFX10-NEXT: v_and_b32_e32 v5, v5, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
-; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
+; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
-; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(7)
-; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8
+; GFX10-NEXT: s_waitcnt lgkmcnt(6)
+; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_and_b32_e32 v8, v12, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v12, v16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6
@@ -174,61 +137,45 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: ds_read_u8 v1, v0
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v3, s4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v4, v7, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v5, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v4, v8, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, v9, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_and_b32_e32 v4, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v5, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -236,52 +183,36 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX10-NEXT: ds_read_u8 v2, v0 offset:2
-; GFX10-NEXT: ds_read_u8 v3, v0 offset:3
-; GFX10-NEXT: ds_read_u8 v4, v0 offset:5
-; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
-; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
-; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
-; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v10, v0
-; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
-; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v13, 8
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: s_mov_b32 s5, 8
-; GFX10-NEXT: s_waitcnt lgkmcnt(11)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: ds_read_u8 v1, v0
+; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
+; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v0, v0 offset:11
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
-; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: s_waitcnt lgkmcnt(7)
-; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(6)
-; GFX10-NEXT: v_and_b32_e32 v6, v6, v12
+; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
-; GFX10-NEXT: s_waitcnt lgkmcnt(3)
-; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v0
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6
; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
index 9c1ce9cff7fed..fc8a367c5491a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
@@ -12,8 +12,8 @@ body: |
; CHECK-LABEL: name: remove_and_255_zextload
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1
- ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1)
- ; CHECK: $vgpr0 = COPY %load(s32)
+ ; CHECK: %and:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1)
+ ; CHECK: $vgpr0 = COPY %and(s32)
%ptr:_(p1) = COPY $vgpr0_vgpr1
%load:_(s32) = G_ZEXTLOAD %ptr :: (load (s8), addrspace 1, align 1)
%mask:_(s32) = G_CONSTANT i32 255
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir
new file mode 100644
index 0000000000000..a16fcf464e59e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+
+# Post-legalizer should not generate illegal extending loads
+---
+name: zextload_from_load_and_mask
+tracksRegLiveness: true
+legalized: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: zextload_from_load_and_mask
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]]
+ ; CHECK: $vgpr0_vgpr1 = COPY [[AND]](s64)
+ %0:_(p1) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 255
+ %2:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1)
+ %3:_(s64) = G_AND %2, %1
+ $vgpr0_vgpr1 = COPY %3
+...
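Taken together, this new test and the remove_and_255_zextload update earlier in the patch bracket the combine's behavior: the G_AND folds away only when the resulting extending load would be legal. A minimal before/after sketch of the folding case, condensed from that earlier test (register names are taken from the test itself; the combine builds the zextload onto the G_AND's destination register, which appears to be why the autogenerated checks now name the result %and):

  ; Before: the mask only keeps bits the s8 zextload already zeroes.
  %ptr:_(p1) = COPY $vgpr0_vgpr1
  %load:_(s32) = G_ZEXTLOAD %ptr :: (load (s8), addrspace 1, align 1)
  %mask:_(s32) = G_CONSTANT i32 255
  %and:_(s32) = G_AND %load, %mask
  $vgpr0 = COPY %and(s32)

  ; After: the redundant G_AND is gone; the load itself supplies the zero bits.
  %ptr:_(p1) = COPY $vgpr0_vgpr1
  %and:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1)
  $vgpr0 = COPY %and(s32)

In the s64 test above, by contrast, folding would require a G_ZEXTLOAD from (s8) to s64, which is illegal after the AMDGPU legalizer has run (per the test's comment), so the G_AND is kept.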
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 5199f033acf11..97591c7fbe1f9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -501,7 +501,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
@@ -1393,7 +1393,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 19aec679d09ac..72bb5ea5e310c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -387,7 +387,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -945,7 +945,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 6bf77bc93b1c5..52a3555c129df 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -851,21 +851,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
@@ -1178,21 +1173,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
@@ -1304,21 +1294,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
@@ -1438,21 +1423,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 189715bfbb6cf..91d952269757a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -350,39 +350,30 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL: ; %bb.0:
; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT: s_mov_b32 s3, 8
-; ALIGNED-GISEL-NEXT: s_movk_i32 s2, 0xff
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 0xff
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v2
-; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v2 offset:1
-; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:2
-; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:3
-; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v2 offset:4
-; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v2 offset:5
-; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v2 offset:6
-; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v2 offset:7
+; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v2 offset:1
+; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v2 offset:2
+; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:3
+; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:4
+; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v2 offset:5
+; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v2 offset:6
+; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v2 offset:7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; ALIGNED-GISEL-NEXT: v_and_or_b32 v0, v0, s2, v3
+; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v3, s2, v4
+; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v4, s2, v5
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v1, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v6, 8, v5
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v4, v8, v1
+; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v1, v9, v1
-; ALIGNED-GISEL-NEXT: v_and_or_b32 v3, v6, s2, v3
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v3, v4, v1
+; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v8
+; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v1, v3, v4
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v2 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:9
; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:10
@@ -453,34 +444,29 @@ define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> add
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL: ; %bb.0:
; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 0xffff
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
-; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
-; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
-; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
-; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
-; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10
+; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:2
+; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:4
+; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:6
+; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:8
+; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:10
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v0, s2, v2
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_and_b32_e32 v2, s2, v4
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; ALIGNED-GISEL-NEXT: v_and_or_b32 v0, v1, s2, v0
-; ALIGNED-GISEL-NEXT: v_and_or_b32 v1, v3, s2, v2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v0
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v2 offset:2
+; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v3 offset:2
; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v1 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v0 offset:6
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v1 offset:4
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 offset:6
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v5 offset:8
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v6 offset:8
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v6 offset:10
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v7 offset:10
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds12align2: