[llvm] [AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases (PR #124624)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 11 09:46:32 PST 2025
https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/124624
>From dbe224ce6e79e4235ebf2806cb7bf5c2a1608820 Mon Sep 17 00:00:00 2001
From: choikwa <code.kchoi at gmail.com>
Date: Fri, 24 Jan 2025 02:21:38 -0500
Subject: [PATCH] [AMDGPU] Filter candidates of LiveRegOptimizer for profitable
cases
It is known that vectors whose elements fit in i16 will be split and
scalarized in SelectionDAG's type legalizer
(see SIISelLowering::getPreferredVectorAction).
LRO attempts to undo the scalarization of vectors across basic block
boundaries and shoehorn values into packed VGPRs. LRO is beneficial for
operations that natively work on illegal vector types, to prevent
flip-flopping between unpacked and packed layouts. If we know that
operations on a vector will be split and scalarized anyway, then we
don't want to shoehorn them back into a packed VGPR.
Operations that we know work natively on illegal vector types usually
come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles,
insert/extract element, and PHI nodes, to name a few.
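As a hand-written illustration (not part of the patch; it mirrors the
vni8-live-reg-opt.ll tests below), this is the shape of candidate LRO
considers: a small-element vector that is live across a basic block
boundary through a PHI:

define amdgpu_kernel void @v4i8_example(ptr addrspace(1) %src,
                                        ptr addrspace(1) %dst, i1 %c) {
entry:
  %vec = load <4 x i8>, ptr addrspace(1) %src, align 4
  br i1 %c, label %bb.1, label %bb.2
bb.1:
  br label %bb.2
bb.2:
  ; %vec is live out of %entry; LRO may coerce it to a packed i32.
  %val = phi <4 x i8> [ %vec, %entry ], [ zeroinitializer, %bb.1 ]
  store <4 x i8> %val, ptr addrspace(1) %dst, align 4
  ret void
}

With this change, the i32 coercion is applied only when a use justifies
keeping the value packed; see the vni8-live-reg-opt.ll diff below,
where the PHIs now stay in vector-of-i8 form.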
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 227 +++++++++++++++++-
.../AMDGPU/GlobalISel/vni8-across-blocks.ll | 136 +++++++----
...dagcomb-extract-vec-elt-different-sizes.ll | 39 ++-
.../AMDGPU/sdwa-peephole-instr-combine-sel.ll | 15 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 36 ++-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 102 +++++---
llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll | 39 +--
7 files changed, 425 insertions(+), 169 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index b8109db821bcc..d7d3d582d28ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -14,12 +14,14 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -45,6 +47,7 @@ class AMDGPULateCodeGenPrepare
Function &F;
const DataLayout &DL;
const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
AssumptionCache *const AC;
UniformityInfo &UA;
@@ -53,8 +56,9 @@ class AMDGPULateCodeGenPrepare
public:
AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
- AssumptionCache *AC, UniformityInfo &UA)
- : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
+ const TargetTransformInfo &TTI, AssumptionCache *AC,
+ UniformityInfo &UA)
+ : F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
bool run();
bool visitInstruction(Instruction &) { return false; }
@@ -75,6 +79,8 @@ class LiveRegOptimizer {
Module &Mod;
const DataLayout &DL;
const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
+
/// The scalar type to convert to
Type *const ConvertToScalar;
/// The set of visited Instructions
@@ -125,8 +131,210 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
- : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
+ // Filtering based on the operation or its cost.
+ // If an operation incurs a high enough cost or natively works on a
+ // vector of illegal type, e.g. v2i8, then it makes sense to try to
+ // coerce it into a packed VGPR across basic blocks.
+ bool shouldReplaceByOp(Instruction *II) {
+ static const int SCALARIZE_INST_COST = 2;
+ static const int LRO_COST_THRES = 12;
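+ // Illustrative numbers (assuming a base TCK_SizeAndLatency cost of 1):
+ // a def consuming a v8i8 scores 1 + 8 * SCALARIZE_INST_COST = 17 and
+ // crosses LRO_COST_THRES, while a v4i8 scores 9 and does not.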
+
+ // Ignore pseudos
+ if (II->isDebugOrPseudoInst())
+ return false;
+
+ // Instruction Cost
+ auto Cost = TTI.getInstructionCost(
+ II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
+ // getOperand(0) would assert on operand-less instructions, so guard it.
+ if (II->getNumOperands() > 0) {
+ const auto *Def = II->getOperand(0);
+ if (const auto *DefTy = dyn_cast<FixedVectorType>(Def->getType())) {
+ const auto *ElTy = dyn_cast<IntegerType>(DefTy->getElementType());
+ // Assume vNi8 and vNi16 will be scalarized.
+ if (ElTy && ElTy->getBitWidth() <= 16) {
+ const auto ElCount = DefTy->getElementCount().getFixedValue();
+ Cost += SCALARIZE_INST_COST * ElCount;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "shouldReplaceByOp: " << *II << " Cost=" << Cost
+ << '\n');
+ if (Cost >= LRO_COST_THRES)
+ return true;
+
+ if (isOpLegal(II))
+ return true;
+
+ return false;
+ }
+
+ /// Check if the intrinsic natively operates on 8-bit or 16-bit elements.
+ bool isNativeIntrinsic(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+ case Intrinsic::amdgcn_fdot2_f16_f16:
+ case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_sdot4:
+ case Intrinsic::amdgcn_sdot2:
+ case Intrinsic::amdgcn_sdot8:
+ case Intrinsic::amdgcn_udot2:
+ case Intrinsic::amdgcn_udot4:
+ case Intrinsic::amdgcn_udot8:
+ case Intrinsic::amdgcn_sudot4:
+ case Intrinsic::amdgcn_sudot8:
+ case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
+ case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
+ case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
+ case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
+ case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
+ case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
+ case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+ case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+ case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+ case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
+ case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
+ case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
+ case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
+ case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
+ case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
+ case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+ case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
+ case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+ case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+ case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
+ case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+ case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+ case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool isOpLegal(Instruction *I) {
+ Type *T = I->getType();
+ if (!TTI.isTypeLegal(T)) {
+ if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID ID = Intr->getIntrinsicID();
+ if (isNativeIntrinsic(ID))
+ return true;
+ }
+ // Stores (e.g. buffer stores) consume the packed form directly.
+ if (isa<StoreInst>(I))
+ return true;
+ return false;
+ }
+ return true;
+ }
+
+ bool isCoercionProfitable(Instruction *II) {
+ if (shouldReplaceByOp(II))
+ return true;
+
+ // Look through users: coercion is also profitable if a transitive user
+ // in another block benefits, looking through pass-through operations
+ // (PHIs, shuffles, insert/extract element).
+ bool Profitable = false;
+ SmallPtrSet<Instruction *, 4> CVisited;
+ SmallVector<Instruction *, 4> UserList;
+ for (User *V : II->users())
+ if (auto *UseInst = dyn_cast<Instruction>(V))
+ UserList.push_back(UseInst);
+
+ while (!UserList.empty() && !Profitable) {
+ auto *CII = UserList.pop_back_val();
+ if (!CVisited.insert(CII).second)
+ continue;
+
+ if (isa<PHINode, ShuffleVectorInst, InsertElementInst,
+ ExtractElementInst>(CII))
+ for (User *V : CII->users())
+ if (auto *UseInst = dyn_cast<Instruction>(V))
+ UserList.push_back(UseInst);
+
+ // Only users outside the defining block can justify the coercion.
+ if (CII->getParent() == II->getParent())
+ continue;
+
+ Profitable = shouldReplaceByOp(CII);
+ }
+ return Profitable;
+ }
+
+ LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
+ const TargetTransformInfo &TTI)
+ : Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};
@@ -140,7 +348,7 @@ bool AMDGPULateCodeGenPrepare::run() {
// vectors to equivalent vectors of legal type (which are converted back
// before uses in subsequent blocks), to pack the bits into fewer physical
// registers (used in CopyToReg/CopyFromReg pairs).
- LiveRegOptimizer LRO(*F.getParent(), ST);
+ LiveRegOptimizer LRO(*F.getParent(), ST, TTI);
bool Changed = false;
@@ -259,6 +467,9 @@ bool LiveRegOptimizer::optimizeLiveType(
if (!shouldReplace(II->getType()))
continue;
+ if (!isCoercionProfitable(II))
+ continue;
+
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
PhiNodes.insert(Phi);
// Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +689,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
- bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+ bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
if (!Changed)
return PreservedAnalyses::all();
@@ -518,13 +730,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+ return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
}
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcde..68e32d4d27276 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -6,36 +6,28 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
-; GFX906-NEXT: v_mov_b32_e32 v5, 16
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v3, v2, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v1, v4, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v1, v4, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
-; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_mov_b32 s0, 0xffff
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT: v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: global_store_short v1, v0, s[6:7]
@@ -63,19 +55,34 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX906-NEXT: global_load_dword v1, v5, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v1, v5, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX906-NEXT: v_mov_b32_e32 v5, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v0, v2
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -99,28 +106,30 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT: global_store_byte v4, v1, s[6:7]
-; GFX906-NEXT: global_store_byte v4, v0, s[6:7] offset:1
-; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[6:7] offset:2
-; GFX906-NEXT: global_store_byte v4, v3, s[6:7] offset:3
-; GFX906-NEXT: global_store_byte v4, v2, s[6:7] offset:4
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX906-NEXT: global_store_byte v0, v3, s[6:7] offset:1
+; GFX906-NEXT: global_store_byte v0, v4, s[6:7] offset:2
+; GFX906-NEXT: global_store_byte v0, v5, s[6:7] offset:3
+; GFX906-NEXT: global_store_byte v0, v2, s[6:7] offset:4
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -446,9 +455,21 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
; GFX906-NEXT: .LBB7_5: ; %return.sink.split
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX906-NEXT: v_mov_b32_e32 v5, 8
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX906-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v4, v2
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX906-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
; GFX906-NEXT: .LBB7_6: ; %return
; GFX906-NEXT: s_endpgm
@@ -586,32 +607,45 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: v_mov_b32_e32 v3, 0xff
; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
; GFX906-NEXT: s_mov_b64 s[0:1], 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT: v_mov_b32_e32 v2, 24
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 24, v1
+; GFX906-NEXT: v_and_or_b32 v0, v1, v3, v0
+; GFX906-NEXT: v_mov_b32_e32 v6, v5
; GFX906-NEXT: .LBB10_1: ; %bb.1
; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_and_or_b32 v7, v1, v3, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_or3_b32 v6, v7, v6, v2
; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX906-NEXT: v_mov_b32_e32 v7, v5
; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_cbranch_execnz .LBB10_1
; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index fcb8fa5997b7e..28fcfa0070441 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb10
-; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[12:13]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[12:13]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
-; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
-; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
-; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
-; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_3: ; %bb41
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x48
; CHECK-NEXT: v_mov_b32_e32 v8, s14
@@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK-NEXT: v_mov_b32_e32 v19, s25
; CHECK-NEXT: v_mov_b32_e32 v20, s26
; CHECK-NEXT: v_mov_b32_e32 v21, s27
-; CHECK-NEXT: flat_store_byte v[8:9], v7
-; CHECK-NEXT: flat_store_byte v[10:11], v6
-; CHECK-NEXT: flat_store_byte v[12:13], v5
-; CHECK-NEXT: flat_store_byte v[14:15], v4
-; CHECK-NEXT: flat_store_byte v[16:17], v3
-; CHECK-NEXT: flat_store_byte v[18:19], v2
-; CHECK-NEXT: flat_store_byte v[20:21], v1
+; CHECK-NEXT: flat_store_byte v[8:9], v0
+; CHECK-NEXT: flat_store_byte v[10:11], v7
+; CHECK-NEXT: flat_store_byte v[12:13], v6
+; CHECK-NEXT: flat_store_byte v[14:15], v5
+; CHECK-NEXT: flat_store_byte v[16:17], v1
+; CHECK-NEXT: flat_store_byte v[18:19], v4
+; CHECK-NEXT: flat_store_byte v[20:21], v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT: flat_store_byte v[2:3], v0
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: flat_store_byte v[0:1], v2
; CHECK-NEXT: s_endpgm
bb:
br i1 %arg, label %bb10, label %bb41
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
index 6eae905278f3e..883a6b70a5a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
@@ -13,33 +13,24 @@ define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
-; CHECK-NEXT: v_mov_b32_e32 v2, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: global_load_ushort v1, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_sbyte v0, v0, s[0:1] offset:2
; CHECK-NEXT: s_bitcmp1_b32 s2, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; CHECK-NEXT: s_cbranch_vccz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb19
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: ds_write_b32 v1, v1
; CHECK-NEXT: .LBB0_2: ; %bb20
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CHECK-NEXT: s_mov_b32 s0, exec_lo
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0
; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
; CHECK-NEXT: s_cbranch_execz .LBB0_4
; CHECK-NEXT: ; %bb.3: ; %bb11
; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: ds_write_b32 v0, v1 offset:84
; CHECK-NEXT: .LBB0_4: ; %bb14
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 459ef648fd806..dcfcffb50c188 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2102,15 +2102,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA: ; %bb.0: ; %bb0
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
+; NOSDWA-NEXT: s_and_b32 s6, s4, 0xff
+; NOSDWA-NEXT: s_bitset1_b32 s6, 8
+; NOSDWA-NEXT: s_and_b32 s6, s6, 0x1ff
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
-; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
+; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT: s_lshr_b32 s6, 0x100, s6
+; NOSDWA-NEXT: s_lshr_b32 s7, s6, s7
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
; NOSDWA-NEXT: flat_store_byte v[0:1], v2
; NOSDWA-NEXT: s_mov_b64 vcc, vcc
; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2122,15 +2125,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX89: ; %bb.0: ; %bb0
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b64 s[4:5], 0
+; GFX89-NEXT: s_and_b32 s6, s4, 0xff
+; GFX89-NEXT: s_bitset1_b32 s6, 8
+; GFX89-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX89-NEXT: s_and_b64 vcc, exec, -1
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX89-NEXT: s_lshl_b32 s6, s4, 3
+; GFX89-NEXT: s_lshl_b32 s7, s4, 3
; GFX89-NEXT: v_mov_b32_e32 v0, s4
-; GFX89-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX89-NEXT: s_lshr_b32 s7, s6, s7
; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_mov_b64 s[4:5], 1
-; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_mov_b32_e32 v2, s7
; GFX89-NEXT: flat_store_byte v[0:1], v2
; GFX89-NEXT: s_mov_b64 vcc, vcc
; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2142,15 +2148,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX9: ; %bb.0: ; %bb0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_and_b32 s6, s4, 0xff
+; GFX9-NEXT: s_bitset1_b32 s6, 8
+; GFX9-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX9-NEXT: s_and_b64 vcc, exec, -1
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_lshl_b32 s6, s4, 3
+; GFX9-NEXT: s_lshl_b32 s7, s4, 3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX9-NEXT: s_lshr_b32 s7, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_mov_b64 s[4:5], 1
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: flat_store_byte v[0:1], v2
; GFX9-NEXT: s_mov_b64 vcc, vcc
; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2161,14 +2170,17 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
; GFX10: ; %bb.0: ; %bb0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s6, s4, 0x100
+; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX10-NEXT: .LBB22_1: ; %bb1
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_lshl_b32 s6, s4, 3
+; GFX10-NEXT: s_lshl_b32 s7, s4, 3
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_lshr_b32 s4, 0x100, s6
+; GFX10-NEXT: s_lshr_b32 s4, s6, s7
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_mov_b64 s[4:5], 1
; GFX10-NEXT: flat_store_byte v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b5e4bcd049c42..dbf789ffb0a89 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,31 +6,27 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: s_mov_b32 s4, 0xff0000
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[0:1]
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v2, v5, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v2, v5, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2
-; GFX906-NEXT: global_store_short v1, v4, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v1, v3, s[6:7] offset:2
+; GFX906-NEXT: global_store_short v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -54,19 +50,31 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v2, v3, s[0:1]
+; GFX906-NEXT: global_load_dword v2, v6, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v2, v3, s[2:3]
+; GFX906-NEXT: global_load_dword v2, v6, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v2, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,23 +98,32 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 0
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4
-; GFX906-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v5, v2, s[6:7] offset:4
+; GFX906-NEXT: global_store_dword v5, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -373,9 +390,17 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
; GFX906-NEXT: .LBB7_5: ; %return.sink.split
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX906-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dword v4, v0, s[6:7]
; GFX906-NEXT: .LBB7_6: ; %return
; GFX906-NEXT: s_endpgm
entry:
@@ -685,22 +710,27 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0
-; GFX906-NEXT: s_mov_b32 s2, 0x2000604
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
; GFX906-NEXT: s_mov_b64 s[0:1], 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v0, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: .LBB12_1: ; %bb.1
; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc
-; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
-; GFX906-NEXT: v_perm_b32 v0, v1, v0, s2
+; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
+; GFX906-NEXT: v_mov_b32_e32 v3, v2
+; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v2, v1
; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_cbranch_execnz .LBB12_1
; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
index a7f9a4c51aa75..414c8841607e8 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
@@ -8,20 +8,14 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906: bb.1:
; GFX906-NEXT: br label [[BB_2]]
; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
-; GFX906-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
+; GFX906-NEXT: [[TMP3:%.*]] = phi <3 x i8> [ [[VEC1]], [[ENTRY:%.*]] ], [ [[VEC2]], [[BB_1]] ]
; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT: ret void
;
@@ -49,17 +43,14 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906: bb.1:
; GFX906-NEXT: br label [[BB_2]]
; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = phi <4 x i8> [ [[VEC1]], [[ENTRY:%.*]] ], [ [[VEC2]], [[BB_1]] ]
; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT: ret void
;
@@ -87,20 +78,14 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906: bb.1:
; GFX906-NEXT: br label [[BB_2]]
; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; GFX906-NEXT: [[TMP3:%.*]] = phi <5 x i8> [ [[VEC1]], [[ENTRY:%.*]] ], [ [[VEC2]], [[BB_1]] ]
; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT: ret void
;
@@ -166,10 +151,8 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [
; GFX906-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]]
; GFX906-NEXT: i32 2, label [[RETURN_SINK_SPLIT]]
@@ -178,8 +161,7 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906: sw.bb5:
; GFX906-NEXT: br label [[RETURN_SINK_SPLIT]]
; GFX906: return.sink.split:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = phi <4 x i8> [ [[VEC2]], [[SW_BB5]] ], [ [[VEC1]], [[ENTRY:%.*]] ], [ [[VEC1]], [[ENTRY]] ]
; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT: ret void
; GFX906: return:
@@ -314,14 +296,11 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT: br label [[BB_1:%.*]]
; GFX906: bb.1:
-; GFX906-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
-; GFX906-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
-; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
-; GFX906-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; GFX906-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT: [[TEMP:%.*]] = phi <4 x i8> [ [[VEC1]], [[ENTRY:%.*]] ], [ [[VEC2:%.*]], [[BB_1]] ]
+; GFX906-NEXT: [[VEC2]] = shufflevector <4 x i8> [[VEC1]], <4 x i8> [[TEMP]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
; GFX906: 0:
@@ -402,12 +381,10 @@ define amdgpu_kernel void @reuseOp() {
; GFX906: bb.1:
; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
; GFX906-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
-; GFX906-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
; GFX906-NEXT: br label [[BB_2:%.*]]
; GFX906: bb.2:
-; GFX906-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
-; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
+; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0]], i64 0
; GFX906-NEXT: ret void
;
entry: