[llvm] 381ded3 - [AMDGPU] Add S_MOV_B64_IMM_PSEUDO for wide constants
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 30 11:45:47 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-06-30T11:45:38-07:00
New Revision: 381ded345bdd59edcea9a87f399f67b18be7622a
URL: https://github.com/llvm/llvm-project/commit/381ded345bdd59edcea9a87f399f67b18be7622a
DIFF: https://github.com/llvm/llvm-project/commit/381ded345bdd59edcea9a87f399f67b18be7622a.diff
LOG: [AMDGPU] Add S_MOV_B64_IMM_PSEUDO for wide constants
This is to allow 64 bit constant rematerialization. If a constant
is split into two separate moves initializing sub0 and sub1, as is
done now, RA cannot rematerialize a 64 bit register.
This gives 10-20% uplift in a set of huge apps heavily using double
precision math.
Fixes: SWDEV-292645
Differential Revision: https://reviews.llvm.org/D104874
Added:
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
llvm/test/CodeGen/AMDGPU/inline-asm.ll
llvm/test/CodeGen/AMDGPU/inline-constraints.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
llvm/test/CodeGen/AMDGPU/packed-fp32.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/shift-i128.ll
llvm/test/CodeGen/AMDGPU/shl.ll
llvm/test/CodeGen/AMDGPU/sopk-compares.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
llvm/test/CodeGen/AMDGPU/urem64.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index fa3c7e657fc1f..49ea80a6dd671 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -75,6 +75,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
+FunctionPass *createGCNPreRAOptimizationsPass();
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
@@ -348,6 +349,9 @@ extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
+void initializeGCNPreRAOptimizationsPass(PassRegistry &);
+extern char &GCNPreRAOptimizationsID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 82d0f832f6a61..a34449c57b3d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -208,6 +208,11 @@ static cl::opt<bool, true> EnableLowerModuleLDS(
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnablePreRAOptimizations(
+ "amdgpu-enable-pre-ra-optimizations",
+ cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -275,6 +280,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNNSAReassignPass(*PR);
+ initializeGCNPreRAOptimizationsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1191,6 +1197,11 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ if (EnablePreRAOptimizations.getNumOccurrences()
+ ? EnablePreRAOptimizations
+ : TM->getOptLevel() > CodeGenOpt::Less)
+ insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
+
// This is not an essential optimization and it has a noticeable impact on
// compilation time, so we only enable it from O2.
if (TM->getOptLevel() > CodeGenOpt::Less)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 0e3ea8d313a26..21a6e39781f0e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -143,6 +143,7 @@ add_llvm_target(AMDGPUCodeGen
GCNILPSched.cpp
GCNNSAReassign.cpp
GCNDPPCombine.cpp
+ GCNPreRAOptimizations.cpp
SIModeRegister.cpp
LINK_COMPONENTS
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
new file mode 100644
index 0000000000000..a51399d7da5f8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -0,0 +1,162 @@
+//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass combines split register tuple initialization into a single pseudo:
+///
+/// undef %0.sub1:sreg_64 = S_MOV_B32 1
+/// %0.sub0:sreg_64 = S_MOV_B32 2
+/// =>
+/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
+///
+/// This is to allow rematerialization of a value instead of spilling. It is
+/// supposed to be done after register coalescer to allow it to do its job and
+/// before actual register allocation to allow rematerialization.
+///
+/// Right now the pass only handles 64 bit SGPRs with immediate initializers,
+/// although the same shall be possible with other register classes and
+/// instructions if necessary.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
+
+namespace {
+
+class GCNPreRAOptimizations : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+ bool processReg(Register Reg);
+
+public:
+ static char ID;
+
+ GCNPreRAOptimizations() : MachineFunctionPass(ID) {
+ initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Pre-RA optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE,
+ "AMDGPU Pre-RA optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "Pre-RA optimizations",
+ false, false)
+
+char GCNPreRAOptimizations::ID = 0;
+
+char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID;
+
+FunctionPass *llvm::createGCNPreRAOptimizationsPass() {
+ return new GCNPreRAOptimizations();
+}
+
+bool GCNPreRAOptimizations::processReg(Register Reg) {
+ MachineInstr *Def0 = nullptr;
+ MachineInstr *Def1 = nullptr;
+ uint64_t Init = 0;
+
+ for (MachineInstr &I : MRI->def_instructions(Reg)) {
+ if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
+ !I.getOperand(1).isImm() || I.getNumOperands() != 2)
+ return false;
+
+ switch (I.getOperand(0).getSubReg()) {
+ default:
+ return false;
+ case AMDGPU::sub0:
+ if (Def0)
+ return false;
+ Def0 = &I;
+ Init |= I.getOperand(1).getImm() & 0xffffffff;
+ break;
+ case AMDGPU::sub1:
+ if (Def1)
+ return false;
+ Def1 = &I;
+ Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+ break;
+ }
+ }
+
+ if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
+ << " =>\n");
+
+ if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
+ LIS->getInstructionIndex(*Def0)))
+ std::swap(Def0, Def1);
+
+ LIS->RemoveMachineInstrFromMaps(*Def0);
+ LIS->RemoveMachineInstrFromMaps(*Def1);
+ auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
+ .addImm(Init);
+
+ Def0->eraseFromParent();
+ Def1->eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*NewI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+
+ LLVM_DEBUG(dbgs() << " " << *NewI);
+
+ return true;
+}
+
+bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
+ continue;
+ Changed |= processReg(Reg);
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4084619240c54..7fd275bd0ade9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1765,6 +1765,30 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandMovDPP64(MI);
break;
}
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ assert(!SrcOp.isFPImm());
+ APInt Imm(64, SrcOp.getImm());
+ if (Imm.isIntN(32) || isInlineConstant(Imm)) {
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+ APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+ APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
+ .addImm(Lo.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
+ .addImm(Hi.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ad6d69468ec91..1d086d14a00cc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -111,6 +111,18 @@ def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}
+// 64-bit scalar move immediate instruction. This is used to avoid subregs
+// initialization and allow rematerialization.
+def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
+ (ins i64imm:$src0)> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let SchedRW = [WriteSALU, Write64Bit];
+ let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
+ let Uses = [];
+}
+
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index cbca2a96f1d40..660eb5bd537cc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1097,11 +1097,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: v_and_b32_e32 v3, s6, v3
+; SI-NEXT: s_movk_i32 s5, 0x80
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_movk_i32 s5, 0x80
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT: v_and_b32_e32 v1, 1, v0
@@ -1129,11 +1129,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_and_b32_e32 v3, s6, v3
+; VI-NEXT: s_movk_i32 s5, 0x80
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
-; VI-NEXT: s_mov_b32 s4, 0
-; VI-NEXT: s_movk_i32 s5, 0x80
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT: v_and_b32_e32 v1, 1, v0
@@ -1165,10 +1165,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT: v_and_b32_e32 v3, s4, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_movk_i32 s5, 0x80
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT: v_and_b32_e32 v1, 1, v0
@@ -1195,10 +1195,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT: v_and_b32_e32 v3, s4, v3
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_movk_i32 s5, 0x80
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT: v_and_b32_e32 v1, 1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index a944adb4375e9..2dedb531bc1bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2751,9 +2751,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
; GPRIDX-NEXT: s_mov_b32 s0, 0
+; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
; GPRIDX-NEXT: s_mov_b32 s2, s0
-; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
@@ -2842,9 +2842,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
; MOVREL-NEXT: s_mov_b32 s0, 0
+; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_mov_b32 s2, s0
-; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
@@ -2935,9 +2935,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0x40140000
; GFX10-NEXT: s_mov_b32 s5, 0x40080000
; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x40140000
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s8, 1
@@ -3837,21 +3837,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0
; GPRIDX-NEXT: .end_amd_kernel_code_t
; GPRIDX-NEXT: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8
-; GPRIDX-NEXT: s_mov_b32 s0, 0
-; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000
+; GPRIDX-NEXT: s_mov_b32 s2, 0
+; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
; GPRIDX-NEXT: v_mov_b32_e32 v2, 0
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2
-; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3
-; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
+; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:
@@ -3924,21 +3924,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; MOVREL-NEXT: runtime_loader_kernel_symbol = 0
; MOVREL-NEXT: .end_amd_kernel_code_t
; MOVREL-NEXT: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8
-; MOVREL-NEXT: s_mov_b32 s0, 0
-; MOVREL-NEXT: s_mov_b32 s1, 0x40080000
+; MOVREL-NEXT: s_mov_b32 s2, 0
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v2, s2
+; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
-; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s6, 3
-; MOVREL-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: v_mov_b32_e32 v1, s1
-; MOVREL-NEXT: v_mov_b32_e32 v3, s3
+; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
+; MOVREL-NEXT: v_mov_b32_e32 v0, s2
+; MOVREL-NEXT: v_mov_b32_e32 v1, s3
+; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; MOVREL-NEXT: s_endpgm
;
@@ -4078,8 +4078,7 @@ define i32 @v_extract_v64i32_32(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_32:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_movk_i32 s4, 0x80
-; MOVREL-NEXT: s_mov_b32 s5, 0
+; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@@ -4112,8 +4111,7 @@ define i32 @v_extract_v64i32_33(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_33:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_movk_i32 s4, 0x80
-; MOVREL-NEXT: s_mov_b32 s5, 0
+; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@@ -4140,8 +4138,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_37:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_movk_i32 s4, 0x80
-; GPRIDX-NEXT: s_mov_b32 s5, 0
+; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80
; GPRIDX-NEXT: v_mov_b32_e32 v2, s4
; GPRIDX-NEXT: v_mov_b32_e32 v3, s5
; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
@@ -4154,8 +4151,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_37:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_movk_i32 s4, 0x80
-; MOVREL-NEXT: s_mov_b32 s5, 0
+; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@@ -4171,8 +4167,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_movk_i32 s4, 0x80
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b64 s[4:5], 0x80
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
index a4f5948da2c69..d7ea1404b1175 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -7,8 +7,8 @@ define double @v_floor_f64_ieee(double %x) {
; GFX6-LABEL: v_floor_f64_ieee:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -30,8 +30,8 @@ define double @v_floor_f64_ieee_nnan(double %x) {
; GFX6-LABEL: v_floor_f64_ieee_nnan:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
@@ -50,8 +50,8 @@ define double @v_floor_f64_ieee_fneg(double %x) {
; GFX6-LABEL: v_floor_f64_ieee_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -74,8 +74,8 @@ define double @v_floor_f64_nonieee(double %x) #1 {
; GFX6-LABEL: v_floor_f64_nonieee:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -97,8 +97,8 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
; GFX6-LABEL: v_floor_f64_nonieee_nnan:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
@@ -117,8 +117,8 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -141,8 +141,8 @@ define double @v_floor_f64_fabs(double %x) {
; GFX6-LABEL: v_floor_f64_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -170,8 +170,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
; GFX6-LABEL: v_floor_f64_fneg_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -194,8 +194,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
; GFX6-LABEL: s_floor_f64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fabs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg_fabs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index cbeb0140efd00..e852f9a4a4c18 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4703,8 +4703,7 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshl_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s10, 0x7f
-; GFX6-NEXT: s_mov_b32 s11, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_sub_i32 s9, s12, 64
@@ -4751,8 +4750,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX8-LABEL: s_fshl_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s10, 0x7f
-; GFX8-NEXT: s_mov_b32 s11, 0
+; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_sub_i32 s9, s12, 64
@@ -4799,8 +4797,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX9-LABEL: s_fshl_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s10, 0x7f
-; GFX9-NEXT: s_mov_b32 s11, 0
+; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_sub_i32 s9, s12, 64
@@ -4847,8 +4844,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX10-LABEL: s_fshl_i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s10, 0x7f
-; GFX10-NEXT: s_mov_b32 s11, 0
+; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_sub_i32 s9, s12, 64
@@ -5321,8 +5317,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s6, 0x7f
-; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, s8, 64
@@ -5379,8 +5374,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX8-LABEL: v_fshl_i128_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s6, 0x7f
-; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, s8, 64
@@ -5437,8 +5431,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX9-LABEL: v_fshl_i128_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s6, 0x7f
-; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, s8, 64
@@ -5495,8 +5488,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX10-LABEL: v_fshl_i128_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s6, 0x7f
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
@@ -5556,8 +5548,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s6, 0x7f
-; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s6, 64, s8
@@ -5612,8 +5603,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshl_i128_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s6, 0x7f
-; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s6, 64, s8
@@ -5668,8 +5658,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshl_i128_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s6, 0x7f
-; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s6, 64, s8
@@ -5724,8 +5713,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX10-LABEL: v_fshl_i128_vss:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s6, 0x7f
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_sub_i32 s4, 64, s8
@@ -5902,8 +5890,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s18, 0x7f
-; GFX6-NEXT: s_mov_b32 s19, 0
+; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX6-NEXT: s_sub_i32 s17, s22, 64
@@ -5991,8 +5978,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX8-LABEL: s_fshl_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s18, 0x7f
-; GFX8-NEXT: s_mov_b32 s19, 0
+; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX8-NEXT: s_sub_i32 s17, s22, 64
@@ -6080,8 +6066,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX9-LABEL: s_fshl_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s18, 0x7f
-; GFX9-NEXT: s_mov_b32 s19, 0
+; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX9-NEXT: s_sub_i32 s17, s22, 64
@@ -6169,8 +6154,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX10-LABEL: s_fshl_v2i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s18, 0x7f
-; GFX10-NEXT: s_mov_b32 s19, 0
+; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX10-NEXT: s_sub_i32 s17, s22, 64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index b8a83aac36044..743fe18c5da97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4840,8 +4840,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshr_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s10, 0x7f
-; GFX6-NEXT: s_mov_b32 s11, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_sub_i32 s9, 64, 1
@@ -4888,8 +4887,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX8-LABEL: s_fshr_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s10, 0x7f
-; GFX8-NEXT: s_mov_b32 s11, 0
+; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_sub_i32 s9, 64, 1
@@ -4936,8 +4934,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX9-LABEL: s_fshr_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s10, 0x7f
-; GFX9-NEXT: s_mov_b32 s11, 0
+; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_sub_i32 s9, 64, 1
@@ -4984,8 +4981,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s10, 0x7f
-; GFX10-NEXT: s_mov_b32 s11, 0
+; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX10-NEXT: s_sub_i32 s13, 64, 1
@@ -5458,8 +5454,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s6, 0x7f
-; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, 64, 1
@@ -5515,8 +5510,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s6, 0x7f
-; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, 64, 1
@@ -5572,8 +5566,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s6, 0x7f
-; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, 64, 1
@@ -5629,8 +5622,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_movk_i32 s6, 0x7f
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_sub_i32 s9, 64, 1
@@ -5689,8 +5681,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s6, 0x7f
-; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, 64, 1
@@ -5746,8 +5737,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s6, 0x7f
-; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, 64, 1
@@ -5803,8 +5793,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s6, 0x7f
-; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, 64, 1
@@ -5863,19 +5852,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: s_sub_i32 s6, 64, 1
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT: s_movk_i32 s6, 0x7f
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5]
; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
+; GFX10-NEXT: s_sub_i32 s4, 64, s8
; GFX10-NEXT: v_or_b32_e32 v2, v4, v2
; GFX10-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX10-NEXT: s_sub_i32 s4, 64, s8
; GFX10-NEXT: s_sub_i32 s5, s8, 64
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
@@ -6044,8 +6032,7 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s18, 0x7f
-; GFX6-NEXT: s_mov_b32 s19, 0
+; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX6-NEXT: s_sub_i32 s28, 64, 1
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@@ -6133,8 +6120,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s18, 0x7f
-; GFX8-NEXT: s_mov_b32 s19, 0
+; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX8-NEXT: s_sub_i32 s28, 64, 1
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@@ -6222,8 +6208,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s18, 0x7f
-; GFX9-NEXT: s_mov_b32 s19, 0
+; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX9-NEXT: s_sub_i32 s28, 64, 1
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@@ -6311,13 +6296,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX10-NEXT: s_sub_i32 s28, 64, 1
-; GFX10-NEXT: s_movk_i32 s18, 0x7f
-; GFX10-NEXT: s_mov_b32 s19, 0
-; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3]
; GFX10-NEXT: s_sub_i32 s23, s16, 64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 4754bd208e52a..1fa75feb9d83b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -19,8 +19,7 @@ define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-NEXT: s_movk_i32 s4, 0x1000
-; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@@ -50,8 +49,7 @@ define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-NEXT: s_movk_i32 s4, 0x1000
-; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index b5332e25dffd0..a0f02fad31b95 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -336,8 +336,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s4, 0x400
-; GFX6-NEXT: s_mov_b32 s5, 0
+; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@@ -352,8 +351,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_movk_i32 s4, 0x400
-; GFX7-NEXT: s_mov_b32 s5, 0
+; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@@ -792,8 +790,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_movk_i32 s4, 0x400
-; GFX6-NEXT: s_mov_b32 s5, 0
+; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX6-NEXT: v_mov_b32_e32 v3, s5
@@ -808,8 +805,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_movk_i32 s4, 0x400
-; GFX7-NEXT: s_mov_b32 s5, 0
+; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX7-NEXT: v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 6ba213c313829..472b315bc626d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -568,8 +568,8 @@ define double @v_roundeven_f64_fneg(double %x) {
; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4
; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4]
-; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4]
; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index a899655cfd96e..d82d0ce5637dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2509,8 +2509,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s4, 0x1000
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
@@ -2703,8 +2702,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s6, 0x1000
-; GISEL-NEXT: s_mov_b32 s7, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
@@ -2996,8 +2994,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0
+; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: v_mov_b32_e32 v5, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 2ae38a64fe34a..1b7184742e5dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -2473,8 +2473,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_srem_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s4, 0x1000
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
@@ -2663,8 +2662,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s6, 0x1000
-; GISEL-NEXT: s_mov_b32 s7, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
@@ -2952,8 +2950,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0
+; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: v_mov_b32_e32 v5, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 4ee838f942b39..82f35daae56c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2291,8 +2291,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_udiv_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s4, 0x1000
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
; CHECK-NEXT: v_mov_b32_e32 v2, 0
@@ -2470,8 +2469,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GISEL-NEXT: s_mov_b32 s5, 0
+; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
@@ -2735,8 +2733,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_mov_b32_e32 v5, v0
; CGP-NEXT: v_mov_b32_e32 v7, v1
-; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0
+; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
; CGP-NEXT: v_or_b32_e32 v1, v7, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 580dc2f4b81d4..69cb3b956cc25 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1651,8 +1651,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_urem_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s4, 0x1000
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
; CHECK-NEXT: v_mov_b32_e32 v2, 0
@@ -1827,8 +1826,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GISEL-NEXT: s_mov_b32 s5, 0
+; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
@@ -2090,8 +2088,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_mov_b32_e32 v5, v0
; CGP-NEXT: v_mov_b32_e32 v7, v1
-; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0
+; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
; CGP-NEXT: v_or_b32_e32 v1, v7, v11
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 2f752c2ceff15..c7a8d01fba838 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -67,6 +67,7 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -157,6 +158,7 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -268,6 +270,7 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -383,6 +386,7 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -464,6 +468,7 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -534,6 +539,7 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -612,6 +618,7 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -696,6 +703,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -768,6 +776,7 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -835,6 +844,7 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -912,6 +922,7 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -997,6 +1008,7 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -1249,6 +1261,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1542,6 +1555,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1915,6 +1929,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2308,6 +2323,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2567,6 +2583,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -2791,6 +2808,7 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3043,6 +3061,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3322,6 +3341,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3455,6 +3475,7 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3528,6 +3549,7 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -3609,6 +3631,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3696,6 +3719,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -3843,6 +3867,7 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4023,6 +4048,7 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4219,6 +4245,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4438,6 +4465,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -4631,6 +4659,7 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4827,6 +4856,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5043,6 +5073,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5285,6 +5316,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5394,6 +5426,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5430,6 +5463,7 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5464,6 +5498,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5505,6 +5540,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5550,6 +5586,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5694,6 +5731,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -5776,6 +5814,7 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5814,6 +5853,7 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5849,6 +5889,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5892,6 +5933,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6025,6 +6067,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -6100,6 +6143,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6139,6 +6183,7 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6202,6 +6247,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -6275,6 +6321,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6329,6 +6376,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6511,6 +6559,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -6610,6 +6659,7 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6652,6 +6702,7 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6713,6 +6764,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -6787,6 +6839,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6960,6 +7013,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7159,6 +7213,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73
@@ -7303,6 +7358,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7340,6 +7396,7 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -7384,6 +7441,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7525,6 +7583,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@@ -7672,6 +7731,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7824,6 +7884,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8
@@ -7967,6 +8028,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7997,8 +8059,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mov_b32 s5, 0
-; GFX6-NEXT: s_movk_i32 s4, 0x1000
+; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_add_u32 s4, s4, -1
; GFX6-NEXT: s_addc_u32 s5, s5, -1
@@ -8007,12 +8068,12 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_movk_i32 s0, 0x1000
+; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
@@ -8056,6 +8117,7 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -8094,8 +8156,7 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
-; GFX6-NEXT: s_mov_b32 s13, 0
-; GFX6-NEXT: s_movk_i32 s12, 0x1000
+; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8113,13 +8174,13 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_movk_i32 s0, 0x1000
+; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
@@ -8267,6 +8328,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@@ -8410,6 +8472,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -8439,10 +8502,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT: s_mov_b32 s3, 0
-; GFX6-NEXT: s_movk_i32 s2, 0x1000
+; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_ashr_i32 s12, s3, 31
@@ -8458,7 +8521,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s15, s14
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
@@ -8576,11 +8638,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_movk_i32 s2, 0x1000
+; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -8753,6 +8815,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -8917,6 +8980,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000
@@ -9064,10 +9128,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GFX6-NEXT: s_mov_b32 s3, 0
-; GFX6-NEXT: s_movk_i32 s2, 0x1000
+; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
+; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -9078,12 +9142,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15
-; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
; GFX6-NEXT: s_sub_u32 s6, 0, s14
+; GFX6-NEXT: s_subb_u32 s7, 0, s15
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_subb_u32 s7, 0, s15
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
@@ -9330,13 +9393,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_movk_i32 s2, 0x1000
+; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: s_mov_b32 s18, 0x4f800000
; GFX9-NEXT: s_mov_b32 s19, 0x5f7ffffc
+; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -9347,12 +9411,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
; GFX9-NEXT: s_mov_b32 s21, 0xcf800000
; GFX9-NEXT: s_sub_u32 s14, 0, s10
+; GFX9-NEXT: s_subb_u32 s4, 0, s11
; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-NEXT: s_subb_u32 s4, 0, s11
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0
; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0
@@ -9727,6 +9790,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@@ -9870,6 +9934,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -9901,10 +9966,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-LABEL: srem_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT: s_mov_b32 s3, 0
-; GFX6-NEXT: s_movk_i32 s2, 0x1000
+; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
@@ -9920,15 +9985,14 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s15, s14
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s4, s8
+; GFX6-NEXT: s_mov_b32 s5, s9
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: s_mov_b32 s5, s9
; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0
@@ -10036,11 +10100,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_movk_i32 s2, 0x1000
+; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -10216,6 +10280,7 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -10265,10 +10330,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
-; GFX6-NEXT: s_mov_b32 s3, 0
-; GFX6-NEXT: s_movk_i32 s2, 0x1000
+; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
+; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -10279,12 +10344,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17
-; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
; GFX6-NEXT: s_sub_u32 s6, 0, s16
+; GFX6-NEXT: s_subb_u32 s7, 0, s17
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_subb_u32 s7, 0, s17
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
@@ -10527,13 +10591,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
+;
; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_movk_i32 s2, 0x1000
+; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: s_mov_b32 s16, 0x4f800000
; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc
+; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@@ -10544,12 +10609,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
; GFX9-NEXT: s_mov_b32 s19, 0xcf800000
; GFX9-NEXT: s_sub_u32 s4, 0, s14
+; GFX9-NEXT: s_subb_u32 s5, 0, s15
; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-NEXT: s_subb_u32 s5, 0, s15
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
new file mode 100644
index 0000000000000..5f49f6d4ea8fc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
@@ -0,0 +1,98 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=liveintervals,amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: combine_sreg64_inits
+# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
+# GCN: S_NOP 0
+name: combine_sreg64_inits
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1
+ S_NOP 0
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+...
+---
+# GCN-LABEL: name: combine_sreg64_inits_swap
+# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
+# GCN: S_NOP 0
+name: combine_sreg64_inits_swap
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub1:sgpr_64 = S_MOV_B32 2
+ S_NOP 0
+ %0.sub0:sgpr_64 = S_MOV_B32 1
+...
+---
+# GCN-LABEL: name: sreg64_inits_different_blocks
+# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
+name: sreg64_inits_different_blocks
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1
+
+ bb.1:
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+...
+---
+# GCN-LABEL: name: sreg64_inits_two_defs_sub1
+# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 3
+name: sreg64_inits_two_defs_sub1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+ %0.sub1:sgpr_64 = S_MOV_B32 3
+...
+---
+# GCN-LABEL: name: sreg64_inits_two_defs_sub0
+# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
+# GCN: %0.sub0:sgpr_64 = S_MOV_B32 3
+name: sreg64_inits_two_defs_sub0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+ %0.sub0:sgpr_64 = S_MOV_B32 3
+...
+---
+# GCN-LABEL: name: sreg64_inits_full_def
+# GCN: undef %1.sub0:sgpr_64 = S_MOV_B32 1
+# GCN: %0:sgpr_64 = S_MOV_B64 3
+name: sreg64_inits_full_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1
+ %0:sgpr_64 = S_MOV_B64 3
+...
+---
+# GCN-LABEL: name: sreg64_inits_imp_use
+# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
+name: sreg64_inits_imp_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+...
+---
+# GCN-LABEL: name: sreg64_inits_imp_def
+# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
+# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
+name: sreg64_inits_imp_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
+ %0.sub1:sgpr_64 = S_MOV_B32 2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 0630e1043575b..7fceb602a0ba9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -844,8 +844,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
-; CI-NEXT: s_movk_i32 s0, 0x7b
-; CI-NEXT: s_mov_b32 s1, 0
+; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -856,8 +855,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s0, 0x7b
-; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 4aa5d518e11b0..98ee7d47f2f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -205,8 +205,7 @@ entry:
; FIXME: Should not have intermediate sgprs
; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
-; CHECK-DAG: s_mov_b32 s1, 0
-; CHECK-DAG: s_mov_b32 s0, 0x1e240
+; CHECK: s_mov_b64 s[0:1], 0x1e240
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index dc1d442959616..06fa8ee65ddc6 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -59,20 +59,17 @@ define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
ret void
}
-; FIXME: Should be able to use s_mov_b64
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
-; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
-; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
-; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], -4{{$}}
+; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
tail call void asm sideeffect "; use $0", "s"(i64 -4)
ret void
}
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
-; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
-; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
-; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], 1.0{{$}}
+; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
tail call void asm sideeffect "; use $0", "s"(double 1.0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 558923f5cc050..30a1b31f11021 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -900,12 +900,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT: s_load_dword s4, s[4:5], 0x4
-; SI-NEXT: s_mov_b32 s5, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s8, s4, 4
-; SI-NEXT: s_mov_b32 s4, 0xffff
+; SI-NEXT: s_mov_b64 s[4:5], 0xffff
; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT: s_mov_b32 s8, 0x50005
; SI-NEXT: s_and_b32 s9, s5, s8
@@ -923,12 +922,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
-; VI-NEXT: s_mov_b32 s5, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s8, s4, 4
-; VI-NEXT: s_mov_b32 s4, 0xffff
+; VI-NEXT: s_mov_b64 s[4:5], 0xffff
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT: s_mov_b32 s8, 0x50005
; VI-NEXT: s_mov_b32 s9, s8
@@ -1075,14 +1073,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0x4
-; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_lshl_b32 s8, s6, 3
-; SI-NEXT: s_mov_b32 s6, 0xffff
+; SI-NEXT: s_mov_b64 s[6:7], 0xffff
; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_mov_b32 s8, 0x5050505
; SI-NEXT: s_mov_b32 s1, s9
@@ -1100,14 +1097,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x10
-; VI-NEXT: s_mov_b32 s7, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT: s_mov_b32 s0, s8
; VI-NEXT: s_lshl_b32 s8, s6, 3
-; VI-NEXT: s_mov_b32 s6, 0xffff
+; VI-NEXT: s_mov_b64 s[6:7], 0xffff
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT: s_mov_b32 s8, 0x5050505
; VI-NEXT: s_mov_b32 s1, s9
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 3601886edc46f..3b545d2d5faa1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1584,8 +1584,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6
@@ -1607,9 +1606,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b32 s2, 0xffff
+; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_and_b32 s1, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b32 s0, s1, 16
@@ -1635,8 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: s_mov_b32 s3, 0
-; CI-NEXT: s_mov_b32 s2, 0xffff
+; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s4, s4, s2
@@ -1672,8 +1669,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
; GFX9-NEXT: s_lshl_b32 s4, s7, 4
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -1694,9 +1690,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b32 s2, 0xffff
+; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
@@ -1722,10 +1717,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: s_mov_b32 s2, 0xffff
+; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s6, s4, s2
-; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 49ffd8d9fce27..1f1609c8b68db 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -7,7 +7,7 @@
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1 %s
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \
-; RUN: -amdgpu-load-store-vectorizer -debug-pass=Structure < %s 2>&1 \
+; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1-OPTS %s
; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O2 %s
@@ -619,6 +619,7 @@
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Simple Register Coalescing
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
+; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
@@ -899,6 +900,7 @@
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Simple Register Coalescing
; GCN-O2-NEXT: Rename Disconnected Subregister Components
+; GCN-O2-NEXT: AMDGPU Pre-RA optimizations
; GCN-O2-NEXT: Machine Instruction Scheduler
; GCN-O2-NEXT: MachinePostDominator Tree Construction
; GCN-O2-NEXT: SI Whole Quad Mode
@@ -1193,6 +1195,7 @@
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Simple Register Coalescing
; GCN-O3-NEXT: Rename Disconnected Subregister Components
+; GCN-O3-NEXT: AMDGPU Pre-RA optimizations
; GCN-O3-NEXT: Machine Instruction Scheduler
; GCN-O3-NEXT: MachinePostDominator Tree Construction
; GCN-O3-NEXT: SI Whole Quad Mode
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index f0fa9382d00cc..62c9ab28bf7da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -77,9 +77,9 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_movk_i32 s7, 0xfc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
-; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 14d2feccea61d..91f676506a836 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -92,9 +92,8 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0
-; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0
-; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}}
+; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
+; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index ca41899b055e1..63beb537fd4f8 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -103,8 +103,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
-; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
@@ -163,8 +162,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
-; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
new file mode 100644
index 0000000000000..c7961b195542b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_remat_sgpr:
+; GCN-NOT: v_writelane_b32
+; GCN: {{^}}[[LOOP:BB[0-9_]+]]:
+; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x
+; GCN-NOT: v_writelane_b32
+; GCN: s_cbranch_{{[^ ]+}} [[LOOP]]
+; GCN: .sgpr_spill_count: 0
+define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) {
+bb:
+ %i = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %bb3
+
+bb2: ; preds = %bb3
+ ret void
+
+bb3: ; preds = %bb3, %bb
+ %i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ]
+ %i5 = add nuw nsw i32 %i4, %i
+ %i6 = zext i32 %i5 to i64
+ %i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6
+ %i8 = load double, double addrspace(1)* %i7, align 8
+ %i9 = fadd double %i8, 0x3EFC01997CC9E6B0
+ %i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A)
+ %i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2)
+ %i12 = tail call double @llvm.fma.f64(double %i11, double %i9, double 0x3FC3B13BCFA74449)
+ %i13 = tail call double @llvm.fma.f64(double %i12, double %i9, double 0x3FC745D171BF3C30)
+ %i14 = tail call double @llvm.fma.f64(double %i13, double %i9, double 0x3FCC71C71C7792CE)
+ %i15 = tail call double @llvm.fma.f64(double %i14, double %i9, double 0x3FD24924924920DA)
+ %i16 = tail call double @llvm.fma.f64(double %i15, double %i9, double 0x3FD999999999999C)
+ %i17 = tail call double @llvm.fma.f64(double %i16, double %i9, double 0x3FD899999999899C)
+ %i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C)
+ %i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C)
+ %i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C)
+ %i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6
+ store double %i19, double addrspace(1)* %i21, align 8
+ %i22 = add nuw nsw i32 %i4, 1
+ %i23 = icmp eq i32 %i22, 1024
+ br i1 %i23, label %bb2, label %bb3
+}
+
+declare double @llvm.fma.f64(double, double, double)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 607ca5727eb0d..50b1ceac35ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1698,7 +1698,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
@@ -1706,7 +1706,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1724,8 +1723,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index f2077aa2a1ad0..acadd3b5da444 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -146,8 +146,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_movk_i32 s4, 0x41
-; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0x41
; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT: v_mov_b32_e32 v3, s4
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 58f73310967bb..b62a21811426e 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1248,8 +1248,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_movk_i32 s7, 0x11e
; SI-NEXT: s_mov_b32 s6, 0xab19b207
+; SI-NEXT: s_movk_i32 s7, 0x11e
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_movk_i32 s1, 0x11e
; VI-NEXT: s_mov_b32 s0, 0xab19b207
+; VI-NEXT: s_movk_i32 s1, 0x11e
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1319,8 +1319,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_mov_b32 s7, 0
-; SI-NEXT: s_mov_b32 s6, 0x12d687
+; SI-NEXT: s_mov_b64 s[6:7], 0x12d687
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1337,8 +1336,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s1, 0
-; VI-NEXT: s_mov_b32 s0, 0x12d687
+; VI-NEXT: s_mov_b64 s[0:1], 0x12d687
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1927,8 +1925,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s1, 0
-; SI-NEXT: s_mov_b32 s0, 4.0
+; SI-NEXT: s_mov_b64 s[0:1], 0x40800000
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1942,8 +1939,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s1, 0
-; VI-NEXT: s_mov_b32 s0, 4.0
+; VI-NEXT: s_mov_b64 s[0:1], 0x40800000
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2039,8 +2035,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2054,8 +2050,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,8 +2086,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2105,8 +2101,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
index 61993fd9ff26e..39d9ac33278e2 100644
--- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -589,13 +589,13 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
-; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
+; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_eq_u64_e32
define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
- %cmp0 = icmp eq i64 %cond, 1234
+ %cmp0 = icmp eq i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if
if:
@@ -627,13 +627,13 @@ endif:
; GCN-LABEL: {{^}}br_scc_ne_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
-; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
+; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_ne_u64_e32
define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
- %cmp0 = icmp ne i64 %cond, 1234
+ %cmp0 = icmp ne i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if
if:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 74177c4394317..9b0c2abaedf09 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1876,14 +1876,13 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1901,8 +1900,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index bc38031288e7e..f4d7bdf601024 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1249,14 +1249,13 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v2, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1274,8 +1273,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index d9c95cdc63e6f..a5b2cc0557680 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1269,14 +1269,13 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1294,8 +1293,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index ebc041501be0d..9742041a9fc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -807,8 +807,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_mov_b32 s8, -1
+; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8
More information about the llvm-commits
mailing list