[llvm] [GISel][AMDGPU] Expand ShuffleVector (PR #124527)

Alan Li via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 9 09:09:30 PDT 2025


https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/124527

>From 661b8fc4105470772288aaa5d1300cec345b6ab9 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 27 Jan 2025 18:26:19 +0800
Subject: [PATCH 1/2] [GISel][AMDGPU] Expand ShuffleVector

This patch expands G_SHUFFLE_VECTOR into G_UNMERGE_VALUES and G_BUILD_VECTOR
before lowering. The original lowering would emit G_EXTRACT_VECTOR_ELT ops.
By using the unmerged values directly and avoiding the constant indices used
by G_EXTRACT_VECTOR_ELT, the build vector combines can find ways to fold.

Only enabled on AMDGPU.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   4 +
 .../include/llvm/Target/GlobalISel/Combine.td |   7 +
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  40 +++++
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |   3 +-
 .../prelegalizer-combiner-shuffle.mir         | 137 ++++++++++++++++++
 .../AMDGPU/GlobalISel/shufflevector.ll        |  18 +++
 ...ffer-fat-pointers-contents-legalization.ll |  27 +---
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    |   6 +-
 .../llvm.amdgcn.raw.tbuffer.store.d16.ll      |   2 -
 .../llvm.amdgcn.struct.tbuffer.store.d16.ll   |   3 +-
 llvm/test/CodeGen/AMDGPU/mad-mix.ll           |  16 +-
 11 files changed, 222 insertions(+), 41 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 93b424d27fdf1..c81373d85fd5b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -261,6 +261,10 @@ class CombinerHelper {
   void applyCombineShuffleConcat(MachineInstr &MI,
                                  SmallVector<Register> &Ops) const;
 
+  /// Replace G_SHUFFLE_VECTOR \p MI with unmerges and a G_BUILD_VECTOR.
+  bool matchCombineShuffleToBuildVector(MachineInstr &MI) const;
+  void applyCombineShuffleToBuildVector(MachineInstr &MI) const;
+
   /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
   /// Returns true if MI changed.
   ///
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index deed9315c72d8..5309d5952f087 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1571,6 +1571,13 @@ def combine_shuffle_concat : GICombineRule<
         [{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>;
 
+// Expands a G_SHUFFLE_VECTOR of vector sources into a G_BUILD_VECTOR.
+def combine_shuffle_vector_to_build_vector : GICombineRule<
+  (defs root:$root),
+  (match (G_SHUFFLE_VECTOR $dst, $src1, $src2, $mask):$root,
+    [{ return Helper.matchCombineShuffleToBuildVector(*${root}); }]),
+  (apply [{ Helper.applyCombineShuffleToBuildVector(*${root}); }])>;
+
 def insert_vector_element_idx_undef : GICombineRule<
    (defs root:$root),
    (match (G_IMPLICIT_DEF $idx),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index fed1dc53f9bb8..dbc838a3782ca 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/RegisterBankInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -385,6 +386,45 @@ void CombinerHelper::applyCombineConcatVectors(
   MI.eraseFromParent();
 }
 
+bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+         "Invalid instruction");
+  auto &Shuffle = cast<GShuffleVector>(MI);
+
+  // A scalar operand cannot be unmerged and a scalar result cannot be defined
+  // by G_BUILD_VECTOR, so only the all-vector form is expanded.
+  LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+  LLT SrcVec1Type = MRI.getType(Shuffle.getSrc1Reg());
+  LLT SrcVec2Type = MRI.getType(Shuffle.getSrc2Reg());
+  return DstType.isVector() && SrcVec1Type.isVector() && SrcVec2Type.isVector();
+}
+
+void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const {
+  auto &Shuffle = cast<GShuffleVector>(MI);
+
+  Register SrcVec1 = Shuffle.getSrc1Reg();
+  Register SrcVec2 = Shuffle.getSrc2Reg();
+  LLT EltTy = MRI.getType(SrcVec1).getElementType();
+  int Width = MRI.getType(SrcVec1).getNumElements(); // SrcVec1 element count
+
+  auto Unmerge1 = Builder.buildUnmerge(EltTy, SrcVec1);
+  auto Unmerge2 = Builder.buildUnmerge(EltTy, SrcVec2); // may end up unused
+
+  SmallVector<Register> Extracts;
+  // Pick one scalar per mask entry: -1 -> undef, < Width -> Src1, else Src2.
+  for (int Val : Shuffle.getMask()) {
+    if (Val == -1)
+      Extracts.push_back(Builder.buildUndef(EltTy).getReg(0));
+    else if (Val < Width)
+      Extracts.push_back(Unmerge1.getReg(Val));
+    else
+      Extracts.push_back(Unmerge2.getReg(Val - Width));
+  }
+
+  Builder.buildBuildVector(MI.getOperand(0).getReg(), Extracts);
+  MI.eraseFromParent();
+}
+
 bool CombinerHelper::matchCombineShuffleConcat(
     MachineInstr &MI, SmallVector<Register> &Ops) const {
   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a21505356274b..d598395d79e8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -163,7 +163,8 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
 
 def AMDGPUPreLegalizerCombiner: GICombiner<
   "AMDGPUPreLegalizerCombinerImpl",
-  [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg]> {
+  [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
+   foldable_fneg, combine_shuffle_vector_to_build_vector]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
new file mode 100644
index 0000000000000..bba608cceee19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: shuffle_vector_to_extract
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shuffle_vector_to_extract
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+    ; CHECK-NEXT: SI_RETURN
+    %0:_(p3) = COPY $vgpr0
+    %1:_(p3) = COPY $vgpr1
+    %12:_(<8 x s16>) = G_IMPLICIT_DEF
+    %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(4, 5, 6, 7)
+    G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+    SI_RETURN
+...
+
+---
+name: shuffle_vector_to_extract2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shuffle_vector_to_extract2
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[UV3]](s16), [[UV4]](s16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s16>), [[COPY1]](p3) :: (store (<2 x s16>), addrspace 3)
+    ; CHECK-NEXT: SI_RETURN
+    %0:_(p3) = COPY $vgpr0
+    %1:_(p3) = COPY $vgpr1
+    %12:_(<8 x s16>) = G_IMPLICIT_DEF
+    %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    %11:_(<2 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(3, 4)
+    G_STORE %11(<2 x s16>), %1(p3) :: (store (<2 x s16>), addrspace 3)
+    SI_RETURN
+
+...
+
+---
+name: shuffle_vector_to_extract_odd_elements
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shuffle_vector_to_extract_odd_elements
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<3 x s16>), [[COPY1]](p3) :: (store (<3 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: SI_RETURN
+    %0:_(p3) = COPY $vgpr0
+    %1:_(p3) = COPY $vgpr1
+    %12:_(<8 x s16>) = G_IMPLICIT_DEF
+    %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    %11:_(<3 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(0, 1, 2)
+    G_STORE %11(<3 x s16>), %1(p3) :: (store (<3 x s16>), addrspace 3)
+    SI_RETURN
+...
+
+
+---
+name: shuffle_vector_to_extract_minus_1_no_conversion
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shuffle_vector_to_extract_minus_1_no_conversion
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[DEF]](s16), [[UV7]](s16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+    ; CHECK-NEXT: SI_RETURN
+    %0:_(p3) = COPY $vgpr0
+    %1:_(p3) = COPY $vgpr1
+    %12:_(<8 x s16>) = G_IMPLICIT_DEF
+    %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(4, 5, -1, 7)
+    G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+    SI_RETURN
+...
+
+---
+name: shuffle_vector_to_extract_across_vectors_no_conversion
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shuffle_vector_to_extract_across_vectors_no_conversion
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV6]](s16), [[UV7]](s16), [[DEF]](s16), [[DEF]](s16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+    ; CHECK-NEXT: SI_RETURN
+    %0:_(p3) = COPY $vgpr0
+    %1:_(p3) = COPY $vgpr1
+    %12:_(<8 x s16>) = G_IMPLICIT_DEF
+    %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+    %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(6, 7, 8, 9)
+    G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+    SI_RETURN
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
new file mode 100644
index 0000000000000..09274c4d3626b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
+
+define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+; GFX942-LABEL: shuffle_to_extract:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ds_read2_b64 v[2:5], v0 offset1:1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    ds_write_b64 v1, v[4:5]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <8 x half>, ptr addrspace(3) %in, align 8
+  %res = shufflevector <8 x half> %val, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x half> %res, ptr addrspace(3) %out, align 8
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 405058b24dcc2..fdc1dd6cce8e1 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -1736,10 +1736,6 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
-; GISEL-NEXT:    s_mov_b32 s4, 0xffff
-; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1820,11 +1816,6 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
-; GISEL-NEXT:    s_mov_b32 s4, 0xffff
-; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1867,12 +1858,6 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
-; GISEL-NEXT:    s_mov_b32 s4, 0xffff
-; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
-; GISEL-NEXT:    v_bfi_b32 v3, s4, v3, v3
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2181,14 +2166,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i8>, ptr addrspace(7) %p
@@ -3644,11 +3629,11 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <6 x i8>, ptr addrspace(7) %p
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 33e5d1d2ca473..81ef7351b84e9 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -9659,13 +9659,13 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX8-GISEL-NEXT:    v_mad_u16 v6, v4, v5, v6
 ; GFX8-GISEL-NEXT:    v_mad_u16 v2, v0, v1, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX8-GISEL-NEXT:    v_mad_u16 v0, v0, v1, v3
-; GFX8-GISEL-NEXT:    v_mad_u16 v1, v4, v5, v6
+; GFX8-GISEL-NEXT:    v_mad_u16 v1, v4, v5, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 052f7f1c8310b..d976c7992aff5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -187,8 +187,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-PACKED-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT:    s_pack_lh_b32_b16 s6, s6, s6
-; GFX12-PACKED-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX12-PACKED-GISEL-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index d025e7a15e25a..a9e561da98db6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -208,10 +208,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b96 s[8:10], s[4:5], 0x10
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-PACKED-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT:    s_pack_lh_b32_b16 s8, s8, s8
-; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX12-PACKED-GISEL-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
 ; GFX12-PACKED-GISEL-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index 39d2578a088fa..e1e356a92f28e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -440,21 +440,13 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
-; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; GISEL-CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GISEL-CI-NEXT:    v_or_b32_e32 v1, v1, v4
-; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v4, v0, v1
-; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v5, v2
+; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v6, v2
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.shuf = shufflevector <2 x half> %src0, <2 x half> poison, <2 x i32> <i32 1, i32 0>
   %src1.shuf = shufflevector <2 x half> %src1, <2 x half> poison, <2 x i32> <i32 0, i32 1>

>From c96cd322eaf176f8324072fdf0d4b0199967c9dc Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 9 Apr 2025 12:08:53 -0400
Subject: [PATCH 2/2] rebase to latest

---
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |    6 +-
 llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll |  866 ++--------
 llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll |  915 ++---------
 llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll |  866 ++--------
 llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll  | 1140 +++----------
 .../test/CodeGen/AMDGPU/vector-reduce-smax.ll | 1462 +++++------------
 .../test/CodeGen/AMDGPU/vector-reduce-smin.ll | 1462 +++++------------
 .../test/CodeGen/AMDGPU/vector-reduce-umax.ll | 1428 +++++-----------
 .../test/CodeGen/AMDGPU/vector-reduce-umin.ll | 1026 +++---------
 llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll |  983 ++---------
 10 files changed, 2136 insertions(+), 8018 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 28a995e74f7ab..ecd1abce67571 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -2016,9 +2016,11 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    ds_read_b64 v[2:3], v2 offset:8
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; PACKED-GISEL-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; PACKED-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; PACKED-GISEL-NEXT:    v_xor_b32_e32 v5, 0x80000000, v2
-; PACKED-GISEL-NEXT:    v_xor_b32_e32 v4, 0x80000000, v3
+; PACKED-GISEL-NEXT:    v_mov_b32_e32 v4, v3
+; PACKED-GISEL-NEXT:    v_mov_b32_e32 v5, v2
 ; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[4:5]
 ; PACKED-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
index 5404d402828b0..1d921b0d6e254 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
@@ -175,25 +175,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -211,20 +194,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -242,20 +213,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v6, v2, v3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -275,21 +234,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -313,28 +259,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -367,28 +293,8 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -419,42 +325,12 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -476,33 +352,12 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -524,32 +379,12 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -573,35 +408,12 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -630,49 +442,13 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -710,49 +486,13 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -791,76 +531,20 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -890,59 +574,20 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v5, v11
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, v4, v10
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v9
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v8
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v9
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v2, v10
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, v3, v11
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, v4, v12
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v5, v13
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v6, v6, v14
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v7, v7, v15
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -972,56 +617,20 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX9-GISEL-NEXT:    v_add_u16_e32 v5, v5, v11
-; GFX9-GISEL-NEXT:    v_add_u16_e32 v4, v4, v10
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v9
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v2, v2, v10
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, v3, v11
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v4, v4, v12
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v5, v5, v13
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v6, v6, v14
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v7, v7, v15
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1053,63 +662,20 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v5, v5, v10
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v6, v6, v13
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v7, v7, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v7
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, v9
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v3, v3, v10
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, v10
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v3, v3, v11
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v4, v4, v12
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v5, v5, v13
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v6, v6, v14
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v7, v7, v15
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1147,87 +713,23 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, v10
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v3, v3, v11
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v4, v4, v12
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v6, v6, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v7, v7, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v6, v6, v14
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v7, v7, v15
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -1274,87 +776,23 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v2, v2, v10
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v3, v3, v11
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v4, v4, v12
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v6, v6, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v7, v7, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v6, v6, v14
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v7, v7, v15
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -1374,10 +812,6 @@ define i16 @test_vector_reduce_add_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1590,17 +1024,8 @@ define i16 @test_vector_reduce_add_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1617,10 +1042,9 @@ define i16 @test_vector_reduce_add_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v0, v1
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1726,30 +1150,12 @@ define i16 @test_vector_reduce_add_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1772,13 +1178,11 @@ define i16 @test_vector_reduce_add_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1914,56 +1318,20 @@ define i16 @test_vector_reduce_add_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_add_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1996,21 +1364,17 @@ define i16 @test_vector_reduce_add_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    v_add_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 0c497d3308881..4eba4ff954b1f 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -233,24 +233,8 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -267,20 +251,8 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -297,20 +269,8 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v5, v0, v4, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -327,19 +287,8 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -357,29 +306,10 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -404,29 +334,10 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -451,40 +362,12 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -505,31 +388,12 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -550,30 +414,12 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -594,29 +440,12 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -639,46 +468,13 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -710,46 +506,13 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -784,73 +547,20 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -879,55 +589,20 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, v5, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, v4, v10
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v7, v7, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -956,52 +631,20 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, v5, v11
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, v4, v10
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v7, v7, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v7
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1030,51 +673,20 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v6, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v7, v7, v12 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1106,84 +718,25 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v11
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 24, v12
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, v7, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1222,84 +775,25 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v11
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 24, v12
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, v6, v14
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, v7, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, v7, v15
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1322,10 +816,10 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1484,11 +978,8 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -1570,32 +1061,20 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -1613,17 +1092,8 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1640,13 +1110,10 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_and_v8i16:
@@ -1661,11 +1128,9 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1684,15 +1149,13 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_and_v8i16:
@@ -1718,15 +1181,13 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
@@ -1758,64 +1219,36 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -1837,33 +1270,12 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1884,19 +1296,14 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, v1, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v3, v7
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v4, v0, v4
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v2, v6
-; GFX9-GISEL-NEXT:    v_bfi_b32 v1, s0, v1, v1
-; GFX9-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v4, v0
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_and_v16i16:
@@ -1915,17 +1322,13 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1949,23 +1352,18 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_and_v16i16:
@@ -1996,23 +1394,18 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index c8a6a58567623..e035256694ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -235,25 +235,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -270,20 +253,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -300,20 +271,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v6, v2, v3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -330,21 +289,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -362,28 +308,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -410,28 +336,8 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -458,42 +364,12 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -514,33 +390,12 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -561,32 +416,12 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -607,35 +442,12 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -658,49 +470,13 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -732,49 +508,13 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -809,76 +549,20 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v12
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v13
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v14
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v7, v7, v15
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -907,59 +591,20 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v5, v11
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v4, v10
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v9
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v8
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v9
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v10
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v11
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v4, v12
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v5, v13
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v6, v6, v14
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v7, v7, v15
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -988,56 +633,20 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v5, v11
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v4, v10
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v9
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v10
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v11
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v4, v12
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v5, v13
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v6, v6, v14
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v7, v7, v15
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1066,63 +675,20 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v5, v5, v10
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v13
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v7
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v9
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v10
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v10
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v11
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v4, v4, v12
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v5, v5, v13
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v15
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1154,87 +720,23 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v10
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v11
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v4, v4, v12
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v15
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -1275,87 +777,23 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v10
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v11
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v4, v4, v12
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v7, v7, v15
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
@@ -1380,10 +818,6 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1595,17 +1029,8 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1622,9 +1047,8 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i16:
@@ -1726,30 +1150,12 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1772,12 +1178,10 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i16:
@@ -1909,56 +1313,20 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v14
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v11
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v12
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v8
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v10
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v11
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v12
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v13
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v14
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v7, v7, v15
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1991,20 +1359,16 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index 3fbaa88be2c75..46b6e0079a99c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -228,27 +228,8 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, 0xff00, v0
@@ -268,22 +249,8 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -303,27 +270,14 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v6, v2, v3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff0000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff000000
-; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff00
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff0000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_or_v4i8:
@@ -338,25 +292,12 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_or_v4i8:
@@ -372,36 +313,14 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_or_v4i8:
@@ -425,36 +344,14 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
@@ -478,44 +375,12 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, 0xff00, v0
@@ -539,35 +404,12 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -591,40 +433,17 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v6, v4, v5
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v7
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or3_b32 v0, v6, v7, v0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff0000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff000000
-; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff00
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff0000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_or_v8i8:
@@ -643,38 +462,15 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v7, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v4, v6, v0
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_or_v8i8:
@@ -695,59 +491,17 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_or3_b32 v5, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v4, v7, v0
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_or_v8i8:
@@ -776,59 +530,17 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or3_b32 v5, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v4, v7, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff00
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
@@ -860,78 +572,20 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, 0xff00, v0
@@ -963,61 +617,20 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v10
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff0000, v1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, 0xff000000, v1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1049,64 +662,22 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v12, v12, v16, v13
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v14, v12, v13
-; GFX9-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v15
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v14, v15, v4
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v5
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v10, v6
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
 ; GFX9-GISEL-NEXT:    v_or3_b32 v3, v3, v11, v7
 ; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff0000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff000000
-; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v10, v6
+; GFX9-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff00
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff0000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
+; GFX9-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_or_v16i8:
@@ -1133,62 +704,20 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX10-GISEL-NEXT:    v_or3_b32 v15, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v10, v14, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 8, v15
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v5, v5, v17
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v7, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v9
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v2, v6, v10
-; GFX10-GISEL-NEXT:    v_or3_b32 v3, v3, v8, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX10-GISEL-NEXT:    v_or3_b32 v3, v3, v11, v7
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v8, v4
+; GFX10-GISEL-NEXT:    v_or3_b32 v2, v2, v10, v6
+; GFX10-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_or_v16i8:
@@ -1218,91 +747,23 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_or3_b32 v15, v12, v13, v14
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v15
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v15
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v12, v14, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v5, v5, v16
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v6, v6, v17
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_or3_b32 v5, v8, v10, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v1, v1, v6, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v2, v8, v10
-; GFX11-GISEL-NEXT:    v_or3_b32 v3, v3, v9, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-GISEL-NEXT:    v_or3_b32 v3, v3, v11, v7
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v8, v4
+; GFX11-GISEL-NEXT:    v_or3_b32 v2, v2, v10, v6
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_or_v16i8:
@@ -1340,91 +801,23 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_or3_b32 v15, v12, v13, v14
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v15
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v15
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v12, v14, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v5, v5, v16
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v6, v6, v17
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v7, v7, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_or3_b32 v5, v8, v10, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v1, v1, v6, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v2, v8, v10
-; GFX12-GISEL-NEXT:    v_or3_b32 v3, v3, v9, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX12-GISEL-NEXT:    v_or3_b32 v3, v3, v11, v7
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v8, v4
+; GFX12-GISEL-NEXT:    v_or3_b32 v2, v2, v10, v6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff000000
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v1, 0xff0000, v1, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff00
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff000000
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
@@ -1446,10 +839,10 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1610,11 +1003,8 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1696,32 +1086,20 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1739,17 +1117,8 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1765,13 +1134,10 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_or_v8i16:
@@ -1785,11 +1151,9 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s4, v1
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1807,15 +1171,13 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_or_v8i16:
@@ -1840,15 +1202,13 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
@@ -1880,64 +1240,36 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1959,33 +1291,12 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2004,19 +1315,14 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v5, v1, v5
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v3, v7
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v2, v6
-; GFX9-GISEL-NEXT:    v_bfi_b32 v1, s0, v1, v1
-; GFX9-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v0
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_or_v16i16:
@@ -2033,17 +1339,13 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s4, v1
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2064,23 +1366,18 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_or_v16i16:
@@ -2108,23 +1405,18 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index 8d69e76605acd..05d826872da34 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -260,45 +260,21 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
@@ -321,39 +297,18 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -380,32 +335,17 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, s0, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
@@ -432,45 +372,27 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v2, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v1, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v4, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v5, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 16, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v5
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 8, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v5
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smax_v4i8:
@@ -493,56 +415,35 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_max_i16 v4, v2, s0
 ; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_max_i16 v5, v1, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX11-GISEL-NEXT:    v_max_i16 v2, v4, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smax_v4i8:
@@ -573,58 +474,35 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v4, v2, s0
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_max_i16 v5, v1, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_max_i16 v2, v4, s0
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
@@ -653,63 +531,29 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v6, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v7, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
@@ -736,58 +580,27 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v5
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v4, v8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
@@ -817,45 +630,23 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -887,60 +678,35 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v6
+; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v2, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v1, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v4, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smax_v8i8:
@@ -971,77 +737,43 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, v5
+; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
 ; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
+; GFX11-GISEL-NEXT:    v_max_i16 v4, v2, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_max_i16 v5, v1, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v2, v4, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smax_v8i8:
@@ -1080,79 +812,43 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, v5
+; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_max_i16 v4, v2, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v5, v1, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v2, v4, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
@@ -1195,103 +891,45 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 0, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v10
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 8, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v10
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 16, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 24, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v12, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 8, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v13, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 16, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v14, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v6, v6, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 24, 8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v15, 0, 8
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v10
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, s4, v3
@@ -1326,98 +964,43 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v9, v16, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v9
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v10
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v13, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v12
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v4, sext(v4), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v13
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v14
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v6, sext(v6), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v7, 8, v7
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v15
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v7, sext(v7), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v16, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v4
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v4, v16, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
@@ -1453,67 +1036,31 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX9-GISEL-NEXT:    v_and_or_b32 v9, v12, v16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-GISEL-NEXT:    v_or3_b32 v9, v9, v10, v11
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v4, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v6, sext(v6), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v7, sext(v7), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
 ; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, v3, v7
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_max_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1559,94 +1106,51 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX10-GISEL-NEXT:    v_or3_b32 v11, v12, v13, v14
+; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v11, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v11, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v12, v11, 24, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v5, v5, v9
-; GFX10-GISEL-NEXT:    v_max_i16 v6, v6, v10
-; GFX10-GISEL-NEXT:    v_max_i16 v7, v7, v12
-; GFX10-GISEL-NEXT:    v_max_i16 v4, v4, v11
-; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v8, 8, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v8, 24, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v9
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v8, 16, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, v10
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v8, 0, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v5
+; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, v11
+; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v15, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v11, v14, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, v8
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v5, v5, v9
+; GFX10-GISEL-NEXT:    v_max_i16 v7, v7, v10
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v8
+; GFX10-GISEL-NEXT:    v_max_i16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v4, v12
 ; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v5
+; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_max_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v2, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_i16 v2, v1, s4
 ; GFX10-GISEL-NEXT:    v_max_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v6, 0xff, v0, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_max_i16 v4, v4, s4
+; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v6, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smax_v16i8:
@@ -1691,113 +1195,58 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v12, 8, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v11, v12, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_max_i16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v12, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v6, v6, v11
-; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v8, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v7, v7, v13
-; GFX11-GISEL-NEXT:    v_max_i16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v10, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v10, v11, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v11, v13, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v15, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v8, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v8, 24, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, v5
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v6
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v7
+; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, v10
+; GFX11-GISEL-NEXT:    v_max_i16 v5, v5, v11
+; GFX11-GISEL-NEXT:    v_max_i16 v7, v7, v13
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v14, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v10, v12, 0, 8
+; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v5
+; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_max_i16 v5, v6, v9
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_max_i16 v4, v4, v10
+; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
 ; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
+; GFX11-GISEL-NEXT:    v_max_i16 v4, v1, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v5, v2, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_i16 v2, v2, s0
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-GISEL-NEXT:    v_max_i16 v4, v5, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smax_v16i8:
@@ -1850,115 +1299,58 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v12, 8, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v11, v12, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_max_i16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v12, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
-; GFX12-GISEL-NEXT:    v_max_i16 v6, v6, v11
-; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v8, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_max_i16 v7, v7, v13
-; GFX12-GISEL-NEXT:    v_max_i16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v10, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v10, v11, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v11, v13, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v15, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v8, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v8, 24, 8
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v4, 8, 8
-; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, v5
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v6
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v7
+; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, v10
+; GFX12-GISEL-NEXT:    v_max_i16 v5, v5, v11
+; GFX12-GISEL-NEXT:    v_max_i16 v7, v7, v13
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v14, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v10, v12, 0, 8
+; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_max_i16 v5, v6, v9
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_max_i16 v4, v4, v10
+; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, v5
-; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_max_i16 v4, v1, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v5, v2, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_max_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX12-GISEL-NEXT:    v_max_i16 v4, v5, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_max_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
@@ -1977,14 +1369,10 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2214,22 +1602,14 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2249,9 +1629,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2360,39 +1738,22 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v4, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v6, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v7, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2417,13 +1778,10 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2568,73 +1926,38 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v12, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v13, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v14, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 16, 16
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v15, 0, 16
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v4, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v12
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_max_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2669,21 +1992,16 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index 5703b5e66caac..db92e3b401340 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -260,45 +260,21 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
@@ -321,39 +297,18 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -380,32 +335,17 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, s0, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
@@ -432,45 +372,27 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v2, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v1, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v4, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v5, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 16, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v5
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 8, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v5
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smin_v4i8:
@@ -493,56 +415,35 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_min_i16 v4, v2, s0
 ; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_min_i16 v5, v1, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX11-GISEL-NEXT:    v_min_i16 v2, v4, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smin_v4i8:
@@ -573,58 +474,35 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v4, v2, s0
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_min_i16 v5, v1, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_min_i16 v2, v4, s0
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
@@ -653,63 +531,29 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v6, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v7, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
@@ -736,58 +580,27 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v5
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v4, v8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
@@ -817,45 +630,23 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -887,60 +678,35 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v6
+; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v2, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v1, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v4, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smin_v8i8:
@@ -971,77 +737,43 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, v5
+; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
 ; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
+; GFX11-GISEL-NEXT:    v_min_i16 v4, v2, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_min_i16 v5, v1, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v2, v4, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smin_v8i8:
@@ -1080,79 +812,43 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, v5
+; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_min_i16 v4, v2, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v5, v1, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v2, v4, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
@@ -1195,103 +891,45 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 0, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v10
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 8, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v10
-; GFX7-GISEL-NEXT:    v_bfe_i32 v10, v8, 16, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 24, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v12, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 8, 8
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v13, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 16, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v14, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v6, v6, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 24, 8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v15, 0, 8
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 0, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v10
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 24, 8
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v5
-; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, s4, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, s4, v3
@@ -1326,98 +964,43 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v9, v16, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v9
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v10
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v13, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v12
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v4, sext(v4), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v13
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v14
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v6, sext(v6), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v7, 8, v7
 ; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v15
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v7, sext(v7), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v5, v16, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v4
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_sdwa v4, v16, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
@@ -1453,67 +1036,31 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX9-GISEL-NEXT:    v_and_or_b32 v9, v12, v16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-GISEL-NEXT:    v_or3_b32 v9, v9, v10, v11
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v4, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v6, sext(v6), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v7, sext(v7), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
 ; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, v3, v7
 ; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX9-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v1, s0, v1
-; GFX9-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v2, s0, v2
 ; GFX9-GISEL-NEXT:    v_min_i16_e32 v3, s0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1559,94 +1106,51 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX10-GISEL-NEXT:    v_or3_b32 v11, v12, v13, v14
+; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
+; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v11, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v11, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v12, v11, 24, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v5, v5, v9
-; GFX10-GISEL-NEXT:    v_min_i16 v6, v6, v10
-; GFX10-GISEL-NEXT:    v_min_i16 v7, v7, v12
-; GFX10-GISEL-NEXT:    v_min_i16 v4, v4, v11
-; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v8, 8, 8
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v8, 24, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v9
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v8, 16, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, v10
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v8, 0, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v4, 16, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v5
+; GFX10-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, v11
+; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v10, v15, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v11, v14, 0, 8
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, v8
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
+; GFX10-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v5, v5, v9
+; GFX10-GISEL-NEXT:    v_min_i16 v7, v7, v10
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v8
+; GFX10-GISEL-NEXT:    v_min_i16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v4, v12
 ; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v5
+; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX10-GISEL-NEXT:    s_sext_i32_i8 s4, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX10-GISEL-NEXT:    v_min_i16 v2, v2, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, s4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_min_i16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v2, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_min_i16 v2, v1, s4
 ; GFX10-GISEL-NEXT:    v_min_i16 v3, v3, s4
-; GFX10-GISEL-NEXT:    v_and_or_b32 v6, 0xff, v0, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_min_i16 v4, v4, s4
+; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v6, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_smin_v16i8:
@@ -1691,113 +1195,58 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v12, 8, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v11, v12, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_min_i16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v12, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v6, v6, v11
-; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v8, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v7, v7, v13
-; GFX11-GISEL-NEXT:    v_min_i16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v10, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v10, v11, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v11, v13, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v13, v15, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v8, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v8, 24, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v7
-; GFX11-GISEL-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v7, v4, 8, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, v5
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v6
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v7
+; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, v10
+; GFX11-GISEL-NEXT:    v_min_i16 v5, v5, v11
+; GFX11-GISEL-NEXT:    v_min_i16 v7, v7, v13
+; GFX11-GISEL-NEXT:    v_bfe_i32 v9, v14, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-GISEL-NEXT:    v_bfe_i32 v10, v12, 0, 8
+; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v5
+; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_min_i16 v5, v6, v9
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_min_i16 v4, v4, v10
+; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
 ; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX11-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
+; GFX11-GISEL-NEXT:    v_min_i16 v4, v1, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v5, v2, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_min_i16 v2, v2, s0
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-GISEL-NEXT:    v_min_i16 v4, v5, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_smin_v16i8:
@@ -1850,115 +1299,58 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v12, 8, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v11, v12, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_min_i16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v12, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v12, v12, 0, 8
-; GFX12-GISEL-NEXT:    v_min_i16 v6, v6, v11
-; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v8, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_min_i16 v7, v7, v13
-; GFX12-GISEL-NEXT:    v_min_i16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v10, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v10, v11, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v11, v13, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v13, v15, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v8, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v8, 24, 8
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v7
-; GFX12-GISEL-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v7, v4, 8, 8
-; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, v5
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v8, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v6
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v7
+; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, v10
+; GFX12-GISEL-NEXT:    v_min_i16 v5, v5, v11
+; GFX12-GISEL-NEXT:    v_min_i16 v7, v7, v13
+; GFX12-GISEL-NEXT:    v_bfe_i32 v9, v14, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 8
 ; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-GISEL-NEXT:    v_bfe_i32 v10, v12, 0, 8
+; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_min_i16 v5, v6, v9
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_min_i16 v4, v4, v10
+; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, v5
-; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX12-GISEL-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_min_i16 v4, v1, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v5, v2, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_min_i16 v3, v3, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v4, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_min_i16 v2, v2, s0
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX12-GISEL-NEXT:    v_min_i16 v4, v5, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_bfe_i32 v5, v4, 24, 8
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_min_i16 v1, v1, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_bfe_i32 v4, v4, 8, 8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
@@ -1977,14 +1369,10 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2214,22 +1602,14 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2249,9 +1629,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2360,39 +1738,22 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v4, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v6, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v7, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2417,13 +1778,10 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2568,73 +1926,38 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v8, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v9, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v12, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v5, v5, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v10, 16, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v13, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v6, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 0, 16
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v14, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX7-GISEL-NEXT:    v_bfe_i32 v7, v7, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v11, 16, 16
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_bfe_i32 v8, v15, 0, 16
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_bfe_i32 v6, v4, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v4, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 0, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_bfe_i32 v4, v5, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v12
-; GFX7-GISEL-NEXT:    v_bfe_i32 v3, v2, 0, 16
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_bfe_i32 v2, v2, 16, 16
-; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_min_i32_e32 v1, 0, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2669,21 +1992,16 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index f198a419ff6bc..57e24d4e431aa 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -243,31 +243,16 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
@@ -290,35 +275,16 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
@@ -342,31 +308,15 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v7, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v6, v6, v7, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
-; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -395,43 +345,26 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xff
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v1, 0
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v4, 0
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v5, v5, v6, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v5
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_umax_v4i8:
@@ -455,55 +388,33 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v3
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_max_u16 v5, v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX11-GISEL-NEXT:    v_max_u16 v2, v4, 0
+; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_umax_v4i8:
@@ -535,55 +446,33 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v3
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX12-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_max_u16 v5, v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX12-GISEL-NEXT:    v_max_u16 v2, v4, 0
+; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
@@ -612,49 +501,24 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v4
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v6
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_bfe_u32 v6, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v6
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v6
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
@@ -681,48 +545,19 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
@@ -752,46 +587,21 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
@@ -824,61 +634,34 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, v9
+; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
+; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v7, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    v_and_or_b32 v7, 0xff, v0, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v7, v4, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v1, 0
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v4, 0
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_umax_v8i8:
@@ -909,83 +692,42 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v5
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
+; GFX11-GISEL-NEXT:    v_max_u16 v5, v1, 0
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v6, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
+; GFX11-GISEL-NEXT:    v_max_u16 v2, v4, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_umax_v8i8:
@@ -1024,83 +766,42 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v5
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX12-GISEL-NEXT:    v_max_u16 v4, v2, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
+; GFX12-GISEL-NEXT:    v_max_u16 v5, v1, 0
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v6, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
+; GFX12-GISEL-NEXT:    v_max_u16 v2, v4, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
@@ -1143,85 +844,40 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v12
-; GFX7-GISEL-NEXT:    v_bfe_u32 v12, v8, 8, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v8, 16, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v10
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v11
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v10
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v12
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v10, 8, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v13
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v10, 16, 8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v10
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v14
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v6, v6, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v7, v7, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v4
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v12
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v6, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v9
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v15
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v7, v7, v8
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v7
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, 0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, 0, v3
@@ -1258,76 +914,27 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v8, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v2, v5
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v4
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
@@ -1365,72 +972,29 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
 ; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v8, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, v2, v5
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; GFX9-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v1, 0, v1
-; GFX9-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v2, 0, v2
 ; GFX9-GISEL-NEXT:    v_max_u16_e32 v3, 0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
@@ -1479,97 +1043,50 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v10
+; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, v11
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v13
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v15
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v13, 0xff
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v15, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_max_u16 v7, v7, v14
-; GFX10-GISEL-NEXT:    v_max_u16 v6, v6, v15
+; GFX10-GISEL-NEXT:    v_max_u16 v5, v5, v9
+; GFX10-GISEL-NEXT:    v_max_u16 v7, v7, v10
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v8
+; GFX10-GISEL-NEXT:    v_max_u16 v6, v6, v11
 ; GFX10-GISEL-NEXT:    v_max_u16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_max_u16 v5, v5, v10
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v6, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, v9
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v6, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, v6
 ; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
+; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v2, 0
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_max_u16 v2, v1, 0
 ; GFX10-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-GISEL-NEXT:    v_max_u16 v4, v4, 0
+; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_vector_reduce_umax_v16i8:
@@ -1616,127 +1133,58 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_max_u16 v7, v7, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_u16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_max_u16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v9
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_max_u16 v6, v6, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v5
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v9
+; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, v10
+; GFX11-GISEL-NEXT:    v_max_u16 v5, v5, v11
+; GFX11-GISEL-NEXT:    v_max_u16 v7, v7, v13
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, v8
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v7
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v12
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
+; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_max_u16 v5, v6, v9
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_max_u16 v4, v4, v10
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v5
+; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, v6
 ; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
+; GFX11-GISEL-NEXT:    v_max_u16 v4, v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-GISEL-NEXT:    v_max_u16 v5, v2, 0
 ; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX11-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v5, v6, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX11-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_max_u16 v4, v5, 0
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_umax_v16i8:
@@ -1791,127 +1239,58 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_max_u16 v7, v7, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_u16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_max_u16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v9
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_max_u16 v6, v6, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 24, v8
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v5
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v9
+; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, v10
+; GFX12-GISEL-NEXT:    v_max_u16 v5, v5, v11
+; GFX12-GISEL-NEXT:    v_max_u16 v7, v7, v13
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, v8
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v7
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v12
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
+; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_max_u16 v5, v6, v9
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_max_u16 v4, v4, v10
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v3
+; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v5
+; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, v6
 ; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
+; GFX12-GISEL-NEXT:    v_max_u16 v4, v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX12-GISEL-NEXT:    v_max_u16 v5, v2, 0
 ; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX12-GISEL-NEXT:    v_max_u16 v2, v2, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v5, v6, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v3
-; GFX12-GISEL-NEXT:    v_max_u16 v3, v3, 0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_max_u16 v4, v5, 0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v1, v1, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_max_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
@@ -1930,13 +1309,10 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2155,20 +1531,14 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2186,9 +1556,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2297,35 +1665,22 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2348,13 +1703,10 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2499,65 +1851,38 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v11
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v5, v5, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v4, v4, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v13
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v11
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v6, v6, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v7, v7, v8
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_max_u32_e32 v1, 0, v1
-; GFX7-GISEL-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2590,21 +1915,16 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_u16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_max_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_max_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_max_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index af6e140231728..b44ec6a24e49d 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -243,23 +243,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -275,22 +264,8 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -308,21 +283,8 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v6, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -344,24 +306,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -386,33 +336,15 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -444,33 +376,15 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -500,41 +414,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v4
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v6
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_bfe_u32 v6, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v6
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v6
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -554,36 +447,12 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -607,34 +476,12 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -663,42 +510,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_min_u16 v2, v2, v7
-; GFX10-GISEL-NEXT:    v_min_u16 v3, v3, v9
-; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v4
+; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    v_min_u16 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_min_u16 v3, v3, v7
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -731,59 +556,24 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX11-GISEL-NEXT:    v_min_u16 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -823,59 +613,24 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX12-GISEL-NEXT:    v_min_u16 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -919,77 +674,36 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v12
-; GFX7-GISEL-NEXT:    v_bfe_u32 v12, v8, 8, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v8, 16, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v10
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v11
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v10
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v12
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v4, v4, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v10, 8, 8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v13
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v8, v10, 16, 8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v10
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v14
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v6, v6, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v7, v7, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v4
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v12
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v6, v4, 8, 8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v9
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v0, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX7-GISEL-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v15
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v7, v7, v8
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1019,64 +733,20 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v8, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v2, v5
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v4
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1108,60 +778,20 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
 ; GFX9-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v8, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v5, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_e32 v2, v2, v5
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_and_b32_sdwa v3, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_min_u16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1206,78 +836,36 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v13, 0xff
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v15, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX10-GISEL-NEXT:    v_min_u16 v7, v7, v14
-; GFX10-GISEL-NEXT:    v_min_u16 v6, v6, v15
-; GFX10-GISEL-NEXT:    v_min_u16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_min_u16 v5, v5, v10
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
+; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v11
+; GFX10-GISEL-NEXT:    v_min_u16 v2, v2, v10
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v12
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v13
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v8
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v6, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX10-GISEL-NEXT:    v_min_u16 v3, v3, v9
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
+; GFX10-GISEL-NEXT:    v_min_u16 v3, v3, v8
+; GFX10-GISEL-NEXT:    v_min_u16 v4, v4, v9
+; GFX10-GISEL-NEXT:    v_min_u16 v5, v5, v10
+; GFX10-GISEL-NEXT:    v_min_u16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_min_u16 v7, v7, v12
+; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_min_u16 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v6, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-GISEL-NEXT:    v_min_u16 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_min_u16 v3, v3, v7
-; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX10-GISEL-NEXT:    v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1326,101 +914,41 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v8
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX11-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-GISEL-NEXT:    v_min_u16 v7, v7, v14
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-GISEL-NEXT:    v_min_u16 v5, v5, v11
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX11-GISEL-NEXT:    v_min_u16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v11
+; GFX11-GISEL-NEXT:    v_min_u16 v2, v2, v10
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v12
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_min_u16 v6, v6, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v13
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v9
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_min_u16 v2, v2, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v9
-; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v7
-; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v6
-; GFX11-GISEL-NEXT:    v_min_u16 v2, v2, v5
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
+; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v8
+; GFX11-GISEL-NEXT:    v_min_u16 v4, v4, v9
+; GFX11-GISEL-NEXT:    v_min_u16 v5, v5, v10
+; GFX11-GISEL-NEXT:    v_min_u16 v6, v6, v11
+; GFX11-GISEL-NEXT:    v_min_u16 v7, v7, v12
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-GISEL-NEXT:    v_min_u16 v2, v2, v6
+; GFX11-GISEL-NEXT:    v_min_u16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX11-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1476,101 +1004,41 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v8
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
-; GFX12-GISEL-NEXT:    v_or3_b32 v8, v8, v10, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX12-GISEL-NEXT:    v_min_u16 v7, v7, v14
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX12-GISEL-NEXT:    v_min_u16 v5, v5, v11
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v13
-; GFX12-GISEL-NEXT:    v_min_u16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v11
+; GFX12-GISEL-NEXT:    v_min_u16 v2, v2, v10
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v12
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_min_u16 v6, v6, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v13
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v9
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_min_u16 v2, v2, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v9
-; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v7
-; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v6
-; GFX12-GISEL-NEXT:    v_min_u16 v2, v2, v5
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
+; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v8
+; GFX12-GISEL-NEXT:    v_min_u16 v4, v4, v9
+; GFX12-GISEL-NEXT:    v_min_u16 v5, v5, v10
+; GFX12-GISEL-NEXT:    v_min_u16 v6, v6, v11
+; GFX12-GISEL-NEXT:    v_min_u16 v7, v7, v12
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v5
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-GISEL-NEXT:    v_min_u16 v2, v2, v6
+; GFX12-GISEL-NEXT:    v_min_u16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
+; GFX12-GISEL-NEXT:    v_min_u16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-GISEL-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1590,10 +1058,8 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v0
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1813,18 +1279,12 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v4i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1841,9 +1301,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1952,33 +1410,20 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2000,13 +1445,10 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v1, v3
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v4, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v4, v2
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2151,63 +1593,36 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v8
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v11
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v5, v5, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v4, v4, v8
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v13
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v5, v5, v8
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v11
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v6, v6, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v7, v7, v8
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v5
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_min_u32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2239,21 +1654,16 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v4, v1, v5
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_u16_e32 v5, v2, v6
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v5, v3, v7
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v5, v8, v2
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v4, v3
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v5, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-GISEL-NEXT:    v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v6, v3, v7
+; GFX8-GISEL-NEXT:    v_min_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v5, v8, v5
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v4, v6
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v2, v5, v2
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    v_min_u16_e32 v1, v2, v0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-GISEL-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
index 48e596a2289fb..277a63f00c2c6 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
@@ -228,25 +228,8 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -263,20 +246,8 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -293,20 +264,8 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v6, v0, v4, v6
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v6, v2, v3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v4, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -323,20 +282,8 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v5, 0xff, v0, v5
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v5, v2, v3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -353,30 +300,9 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -401,30 +327,9 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -449,42 +354,12 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -505,33 +380,12 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -552,32 +406,12 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v8, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v8, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v8, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -598,32 +432,11 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -645,51 +458,13 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -719,51 +494,13 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v1
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -796,76 +533,20 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -894,59 +575,20 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v11
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v10, v12, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v10
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -975,56 +617,20 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v17, 8
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v8, v8, v16, v9
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-GISEL-NEXT:    v_or3_b32 v8, v8, v9, v10
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v10, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v10, v12, v16, v10
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v14
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-GISEL-NEXT:    v_or3_b32 v10, v10, v11, v12
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v10
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v4, v16, v5
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX9-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_and_or_b32 v4, v0, v16, v4
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v0, v16, v1
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1053,56 +659,16 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v16, 8
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX10-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v10
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; GFX10-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v6, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-GISEL-NEXT:    v_or3_b32 v7, v8, v9, v10
-; GFX10-GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v7, v4
-; GFX10-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v9
-; GFX10-GISEL-NEXT:    v_xor3_b32 v2, v2, v6, v10
-; GFX10-GISEL-NEXT:    v_xor3_b32 v3, v3, v8, v11
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX10-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX10-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX10-GISEL-NEXT:    v_xor3_b32 v3, v3, v11, v7
+; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v8, v4
+; GFX10-GISEL-NEXT:    v_xor3_b32 v2, v2, v10, v6
+; GFX10-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1133,83 +699,19 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX11-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX11-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-GISEL-NEXT:    v_or3_b32 v5, v8, v10, v11
-; GFX11-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v5, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_xor3_b32 v1, v1, v6, v7
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-GISEL-NEXT:    v_xor3_b32 v2, v2, v8, v10
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v1
-; GFX11-GISEL-NEXT:    v_xor3_b32 v3, v3, v9, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_xor3_b32 v3, v3, v11, v7
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v8, v4
+; GFX11-GISEL-NEXT:    v_xor3_b32 v2, v2, v10, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1248,83 +750,19 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v12, 0xff, v12, v13
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX12-GISEL-NEXT:    v_and_or_b32 v8, 0xff, v8, v9
-; GFX12-GISEL-NEXT:    v_or3_b32 v12, v12, v13, v14
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v12
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX12-GISEL-NEXT:    v_or3_b32 v5, v8, v10, v11
-; GFX12-GISEL-NEXT:    v_or3_b32 v4, v4, v6, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v5, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_xor3_b32 v1, v1, v6, v7
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX12-GISEL-NEXT:    v_xor3_b32 v2, v2, v8, v10
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v7, 0xff, v1
-; GFX12-GISEL-NEXT:    v_xor3_b32 v3, v3, v9, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v4, 0xff, v0, v4
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_or3_b32 v2, v4, v2, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_and_or_b32 v1, 0xff, v0, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_xor3_b32 v3, v3, v11, v7
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v8, v4
+; GFX12-GISEL-NEXT:    v_xor3_b32 v2, v2, v10, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1347,10 +785,10 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v2i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1511,11 +949,8 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) {
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -1597,32 +1032,20 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -1640,17 +1063,8 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1667,13 +1081,10 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_xor_v8i16:
@@ -1687,11 +1098,9 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1709,15 +1118,13 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_xor_v8i16:
@@ -1742,15 +1149,13 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v)
@@ -1782,64 +1187,36 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v12, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v8
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v9
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX7-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
+; GFX7-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v12
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX7-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX7-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -1861,33 +1238,12 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-GISEL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1908,19 +1264,14 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v5, v1, v5
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v7
-; GFX9-GISEL-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v4, v0, v4
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v6
-; GFX9-GISEL-NEXT:    v_bfi_b32 v1, s0, v1, v1
-; GFX9-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v0
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v5, v1
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v4, v0
-; GFX9-GISEL-NEXT:    v_bfi_b32 v2, s0, v1, v1
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_vector_reduce_xor_v16i16:
@@ -1937,17 +1288,13 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1968,23 +1315,18 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_xor_v16i16:
@@ -2012,23 +1354,18 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_bfi_b32 v2, 0xffff, v1, v1
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)



More information about the llvm-commits mailing list