[llvm] b434051 - [AMDGPU] Introduce SIInstrWorklist to process instructions in moveToVALU
via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 9 23:43:40 PDT 2023
Author: skc7
Date: 2023-04-10T11:34:14+05:30
New Revision: b434051dc83d77c8e8e349ab1992dcb0c795a7ea
URL: https://github.com/llvm/llvm-project/commit/b434051dc83d77c8e8e349ab1992dcb0c795a7ea
DIFF: https://github.com/llvm/llvm-project/commit/b434051dc83d77c8e8e349ab1992dcb0c795a7ea.diff
LOG: [AMDGPU] Introduce SIInstrWorklist to process instructions in moveToVALU
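moveToVALU now takes a SIInstrWorklist instead of a single MachineInstr:
callers collect the instructions to be converted into the worklist, and
MUBUF instructions (those with an srsrc operand) are deferred until the
rest of the worklist has been drained. A minimal sketch of the intended
call pattern, mirroring the SIFixSGPRCopies change below (MI, TII, and
MDT stand in for a caller's instruction, instruction info, and dominator
tree):

  SIInstrWorklist Worklist;
  Worklist.insert(&MI);           // MUBUF instructions are also added to
                                  // the worklist's internal deferred list.
  TII->moveToVALU(Worklist, MDT); // Drains the worklist first, then
                                  // processes the deferred instructions.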
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D147168
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/add3.ll
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
llvm/test/CodeGen/AMDGPU/carryout-selection.ll
llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
llvm/test/CodeGen/AMDGPU/ds_read2.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
llvm/test/CodeGen/AMDGPU/mul.ll
llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
llvm/test/CodeGen/AMDGPU/sdiv.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
llvm/test/CodeGen/AMDGPU/shl.ll
llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
llvm/test/CodeGen/AMDGPU/sra.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/srl.ll
llvm/test/CodeGen/AMDGPU/sub.ll
llvm/test/CodeGen/AMDGPU/udiv.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
llvm/test/CodeGen/AMDGPU/udivrem.ll
llvm/test/CodeGen/AMDGPU/urem64.ll
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 74ede650985ca..db323465c153f 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -869,7 +869,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
return true;
}
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
- TII->moveToVALU(MI, MDT);
+ SIInstrWorklist worklist;
+ worklist.insert(&MI);
+ TII->moveToVALU(worklist, MDT);
return true;
}
@@ -991,6 +993,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
LoweringWorklist.push_back(C.second.ID);
}
+ // Store all the V2S copy instructions that need to be moved to VALU
+ // in the Copies worklist.
+ SIInstrWorklist Copies;
+
while (!LoweringWorklist.empty()) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto CurInfoIt = V2SCopies.find(CurID);
@@ -1013,10 +1019,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
V2SCopies.erase(C.ID);
- TII->moveToVALU(*C.Copy, MDT);
+ Copies.insert(C.Copy);
}
}
+ TII->moveToVALU(Copies, MDT);
+ Copies.clear();
+
// Now do actual lowering
for (auto C : V2SCopies) {
MachineInstr *MI = C.second.Copy;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 018f671291fb7..7ffcd1ba260dd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6161,424 +6161,427 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
-MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
- MachineDominatorTree *MDT) const {
- SetVectorType Worklist;
- Worklist.insert(&TopInst);
- MachineBasicBlock *CreatedBB = nullptr;
- MachineBasicBlock *CreatedBBTmp = nullptr;
-
- while (!Worklist.empty()) {
- MachineInstr &Inst = *Worklist.pop_back_val();
- MachineBasicBlock *MBB = Inst.getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-
- unsigned Opcode = Inst.getOpcode();
- unsigned NewOpcode = getVALUOp(Inst);
-
- // Handle some special cases
- switch (Opcode) {
- default:
- break;
- case AMDGPU::S_ADD_U64_PSEUDO:
- case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
- // FIXME: The u32 versions currently selected use the carry.
- bool Changed;
- std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- if (Changed)
- continue;
-
- // Default handling
- break;
- }
- case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
- Inst.eraseFromParent();
- continue;
-
- case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
- Inst.eraseFromParent();
- continue;
-
- case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
- Inst.eraseFromParent();
- continue;
-
- case AMDGPU::S_NAND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
- Inst.eraseFromParent();
- continue;
-
- case AMDGPU::S_NOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
- Inst.eraseFromParent();
- continue;
+void SIInstrWorklist::insert(MachineInstr *MI) {
+ InstrList.insert(MI);
+ // Add MUBUF instructions to the deferred list.
+ int RsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+ if (RsrcIdx != -1) {
+ DeferredList.insert(MI);
+ }
+}
- case AMDGPU::S_XNOR_B64:
- if (ST.hasDLInsts())
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
- else
- splitScalar64BitXnor(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
+bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
+ return DeferredList.contains(MI);
+}
- case AMDGPU::S_ANDN2_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
- Inst.eraseFromParent();
- continue;
+void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
+ MachineDominatorTree *MDT) const {
- case AMDGPU::S_ORN2_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
- Inst.eraseFromParent();
+ while (!Worklist.empty()) {
+ MachineInstr &Inst = *Worklist.top();
+ Worklist.erase_top();
+ // Skip MachineInstrs that are in the deferred list.
+ if (Worklist.isDeferred(&Inst))
continue;
+ moveToVALUImpl(Worklist, MDT, Inst);
+ }
- case AMDGPU::S_BREV_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
- Inst.eraseFromParent();
- continue;
+ // The deferred list of instructions is processed once
+ // all the MachineInstrs in the worklist are done.
+ for (MachineInstr *Inst : Worklist.getDeferredList()) {
+ moveToVALUImpl(Worklist, MDT, *Inst);
+ assert(Worklist.empty() &&
+ "Deferred MachineInstr are not supposed to re-populate worklist");
+ }
+}
- case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
- Inst.eraseFromParent();
- continue;
+void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
+ MachineDominatorTree *MDT,
+ MachineInstr &Inst) const {
- case AMDGPU::S_BCNT1_I32_B64:
- splitScalar64BitBCNT(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ MachineBasicBlock *MBB = Inst.getParent();
+ if (!MBB)
+ return;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = getVALUOp(Inst);
+ // Handle some special cases
+ switch (Opcode) {
+ default:
+ break;
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ splitScalar64BitAddSub(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32: {
+ // FIXME: The u32 versions currently selected use the carry.
+ bool Changed;
+ MachineBasicBlock *CreatedBBTmp = nullptr;
+ std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
+ if (Changed)
+ return;
- case AMDGPU::S_BFE_I64:
- splitScalar64BitBFE(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ // Default handling
+ break;
+ }
+ case AMDGPU::S_AND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_LSHL_B32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_ASHR_I32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHR_B32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHL_B64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_ASHR_I64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHR_B64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
- swapOperands(Inst);
- }
- break;
+ case AMDGPU::S_OR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_ABS_I32:
- lowerScalarAbs(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_XOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_CBRANCH_SCC0:
- case AMDGPU::S_CBRANCH_SCC1: {
- // Clear unused bits of vcc
- Register CondReg = Inst.getOperand(1).getReg();
- bool IsSCC = CondReg == AMDGPU::SCC;
- Register VCC = RI.getVCC();
- Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
- .addReg(EXEC)
- .addReg(IsSCC ? VCC : CondReg);
- Inst.removeOperand(1);
- }
- break;
+ case AMDGPU::S_NAND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_BFE_U64:
- case AMDGPU::S_BFM_B64:
- llvm_unreachable("Moving this op to VALU not implemented");
+ case AMDGPU::S_NOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_PACK_LL_B32_B16:
- case AMDGPU::S_PACK_LH_B32_B16:
- case AMDGPU::S_PACK_HL_B32_B16:
- case AMDGPU::S_PACK_HH_B32_B16:
- movePackToVALU(Worklist, MRI, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_XNOR_B64:
+ if (ST.hasDLInsts())
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ else
+ splitScalar64BitXnor(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_XNOR_B32:
- lowerScalarXnor(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_ANDN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_NAND_B32:
- splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_ORN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_NOR_B32:
- splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_BREV_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_ANDN2_B32:
- splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_NOT_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_ORN2_B32:
- splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_BCNT1_I32_B64:
+ splitScalar64BitBCNT(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- // TODO: remove as soon as everything is ready
- // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
- // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
- // can only be selected from the uniform SDNode.
- case AMDGPU::S_ADD_CO_PSEUDO:
- case AMDGPU::S_SUB_CO_PSEUDO: {
- unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
- ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
- const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register CarryInReg = Inst.getOperand(4).getReg();
- if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
- Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
- BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
- .addReg(CarryInReg);
- }
+ case AMDGPU::S_BFE_I64:
+ splitScalar64BitBFE(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- Register CarryOutReg = Inst.getOperand(1).getReg();
-
- Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
- MRI.getRegClass(Inst.getOperand(0).getReg())));
- MachineInstr *CarryOp =
- BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
- .addReg(CarryOutReg, RegState::Define)
- .add(Inst.getOperand(2))
- .add(Inst.getOperand(3))
- .addReg(CarryInReg)
- .addImm(0);
- CreatedBBTmp = legalizeOperands(*CarryOp);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
- addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
- Inst.eraseFromParent();
+ case AMDGPU::S_LSHL_B32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+ swapOperands(Inst);
}
- continue;
- case AMDGPU::S_UADDO_PSEUDO:
- case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
- MachineOperand &Dest0 = Inst.getOperand(0);
- MachineOperand &Dest1 = Inst.getOperand(1);
- MachineOperand &Src0 = Inst.getOperand(2);
- MachineOperand &Src1 = Inst.getOperand(3);
-
- unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
- ? AMDGPU::V_ADD_CO_U32_e64
- : AMDGPU::V_SUB_CO_U32_e64;
- const TargetRegisterClass *NewRC =
- RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
- Register DestReg = MRI.createVirtualRegister(NewRC);
- MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
- .addReg(Dest1.getReg(), RegState::Define)
- .add(Src0)
- .add(Src1)
- .addImm(0); // clamp bit
-
- CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
-
- MRI.replaceRegWith(Dest0.getReg(), DestReg);
- addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
- Worklist);
- Inst.eraseFromParent();
+ break;
+ case AMDGPU::S_ASHR_I32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+ swapOperands(Inst);
}
- continue;
-
- case AMDGPU::S_CSELECT_B32:
- case AMDGPU::S_CSELECT_B64:
- lowerSelect(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
- case AMDGPU::S_CMP_EQ_I32:
- case AMDGPU::S_CMP_LG_I32:
- case AMDGPU::S_CMP_GT_I32:
- case AMDGPU::S_CMP_GE_I32:
- case AMDGPU::S_CMP_LT_I32:
- case AMDGPU::S_CMP_LE_I32:
- case AMDGPU::S_CMP_EQ_U32:
- case AMDGPU::S_CMP_LG_U32:
- case AMDGPU::S_CMP_GT_U32:
- case AMDGPU::S_CMP_GE_U32:
- case AMDGPU::S_CMP_LT_U32:
- case AMDGPU::S_CMP_LE_U32:
- case AMDGPU::S_CMP_EQ_U64:
- case AMDGPU::S_CMP_LG_U64: {
- const MCInstrDesc &NewDesc = get(NewOpcode);
- Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
- .add(Inst.getOperand(0))
- .add(Inst.getOperand(1));
- legalizeOperands(*NewInstr, MDT);
- int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
- addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
- Inst.eraseFromParent();
- }
- continue;
+ break;
+ case AMDGPU::S_LSHR_B32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+ swapOperands(Inst);
}
-
- if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
- // We cannot move this instruction to the VALU, so we should try to
- // legalize its operands instead.
- CreatedBBTmp = legalizeOperands(Inst, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- continue;
+ break;
+ case AMDGPU::S_LSHL_B64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
+ swapOperands(Inst);
}
-
- // Handle converting generic instructions like COPY-to-SGPR into
- // COPY-to-VGPR.
- if (NewOpcode == Opcode) {
- Register DstReg = Inst.getOperand(0).getReg();
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
-
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
- MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
- MRI.clearKillFlags(Inst.getOperand(1).getReg());
- Inst.getOperand(0).setReg(DstReg);
-
- // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
- // these are deleted later, but at -O0 it would leave a suspicious
- // looking illegal copy of an undef register.
- for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
- Inst.removeOperand(I);
- Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
- continue;
- }
-
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- legalizeOperands(Inst, MDT);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- continue;
+ break;
+ case AMDGPU::S_ASHR_I64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
+ swapOperands(Inst);
}
-
- // Use the new VALU Opcode.
- auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
- .setMIFlags(Inst.getFlags());
- for (const MachineOperand &Op : Inst.explicit_operands())
- NewInstr->addOperand(Op);
-
- // Remove any references to SCC. Vector instructions can't read from it, and
- // We're just about to add the implicit use / defs of VCC, and we don't want
- // both.
- for (MachineOperand &Op : Inst.implicit_operands()) {
- if (Op.getReg() == AMDGPU::SCC) {
- // Only propagate through live-def of SCC.
- if (Op.isDef() && !Op.isDead())
- addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
- if (Op.isUse())
- addSCCDefsToVALUWorklist(NewInstr, Worklist);
- }
+ break;
+ case AMDGPU::S_LSHR_B64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
+ swapOperands(Inst);
}
+ break;
+ case AMDGPU::S_ABS_I32:
+ lowerScalarAbs(Worklist, Inst);
Inst.eraseFromParent();
+ return;
- Register NewDstReg;
- if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
- Register DstReg = NewInstr->getOperand(0).getReg();
- assert(DstReg.isVirtual());
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1: {
+ // Clear unused bits of vcc
+ Register CondReg = Inst.getOperand(1).getReg();
+ bool IsSCC = CondReg == AMDGPU::SCC;
+ Register VCC = RI.getVCC();
+ Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
+ .addReg(EXEC)
+ .addReg(IsSCC ? VCC : CondReg);
+ Inst.removeOperand(1);
+ } break;
+
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_BFM_B64:
+ llvm_unreachable("Moving this op to VALU not implemented");
+
+ case AMDGPU::S_PACK_LL_B32_B16:
+ case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HL_B32_B16:
+ case AMDGPU::S_PACK_HH_B32_B16:
+ movePackToVALU(Worklist, MRI, Inst);
+ Inst.eraseFromParent();
+ return;
- // Update the destination register class.
- const TargetRegisterClass *NewDstRC =
- getDestEquivalentVGPRClass(*NewInstr);
- assert(NewDstRC);
+ case AMDGPU::S_XNOR_B32:
+ lowerScalarXnor(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- }
+ case AMDGPU::S_NAND_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ return;
- if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
- // We are converting these to a BFE, so we need to add the missing
- // operands for the size and offset.
- unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- NewInstr.addImm(0);
- NewInstr.addImm(Size);
- } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
- // The VALU version adds the second operand to the result, so insert an
- // extra 0 operand.
- NewInstr.addImm(0);
- }
+ case AMDGPU::S_NOR_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ return;
- if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
- // If we need to move this to VGPRs, we need to unpack the second operand
- // back into the 2 separate ones for bit offset and width.
- assert(OffsetWidthOp.isImm() &&
- "Scalar BFE is only implemented for constant width and offset");
- uint32_t Imm = OffsetWidthOp.getImm();
+ case AMDGPU::S_ANDN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ return;
- uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
- uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- NewInstr->removeOperand(2);
- NewInstr.addImm(Offset);
- NewInstr.addImm(BitWidth);
- }
+ case AMDGPU::S_ORN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ return;
- fixImplicitOperands(*NewInstr);
+ // TODO: remove as soon as everything is ready
+ // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+ // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+ // can only be selected from the uniform SDNode.
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register CarryInReg = Inst.getOperand(4).getReg();
+ if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+ Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+ .addReg(CarryInReg);
+ }
+
+ Register CarryOutReg = Inst.getOperand(1).getReg();
+
+ Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+ MRI.getRegClass(Inst.getOperand(0).getReg())));
+ MachineInstr *CarryOp =
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+ .addReg(CarryOutReg, RegState::Define)
+ .add(Inst.getOperand(2))
+ .add(Inst.getOperand(3))
+ .addReg(CarryInReg)
+ .addImm(0);
+ legalizeOperands(*CarryOp);
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineOperand &Dest0 = Inst.getOperand(0);
+ MachineOperand &Dest1 = Inst.getOperand(1);
+ MachineOperand &Src0 = Inst.getOperand(2);
+ MachineOperand &Src1 = Inst.getOperand(3);
+
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::V_ADD_CO_U32_e64
+ : AMDGPU::V_SUB_CO_U32_e64;
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+ .addReg(Dest1.getReg(), RegState::Define)
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp bit
+
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest0.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
- // Legalize the operands
- CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
+ case AMDGPU::S_CSELECT_B32:
+ case AMDGPU::S_CSELECT_B64:
+ lowerSelect(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_CMP_EQ_I32:
+ case AMDGPU::S_CMP_LG_I32:
+ case AMDGPU::S_CMP_GT_I32:
+ case AMDGPU::S_CMP_GE_I32:
+ case AMDGPU::S_CMP_LT_I32:
+ case AMDGPU::S_CMP_LE_I32:
+ case AMDGPU::S_CMP_EQ_U32:
+ case AMDGPU::S_CMP_LG_U32:
+ case AMDGPU::S_CMP_GT_U32:
+ case AMDGPU::S_CMP_GE_U32:
+ case AMDGPU::S_CMP_LT_U32:
+ case AMDGPU::S_CMP_LE_U32:
+ case AMDGPU::S_CMP_EQ_U64:
+ case AMDGPU::S_CMP_LG_U64: {
+ const MCInstrDesc &NewDesc = get(NewOpcode);
+ Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ legalizeOperands(*NewInstr, MDT);
+ int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
+ MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+ Inst.eraseFromParent();
+ }
+ return;
+ }
- if (NewDstReg)
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+ // We cannot move this instruction to the VALU, so we should try to
+ // legalize its operands instead.
+ legalizeOperands(Inst, MDT);
+ return;
}
- return CreatedBB;
+ // Handle converting generic instructions like COPY-to-SGPR into
+ // COPY-to-VGPR.
+ if (NewOpcode == Opcode) {
+ Register DstReg = Inst.getOperand(0).getReg();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
+ NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge. Since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+ MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Inst.getOperand(0).setReg(DstReg);
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a suspicious
+ // looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.removeOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+ return;
+ }
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ legalizeOperands(Inst, MDT);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
+ }
+
+ // Use the new VALU Opcode.
+ auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
+ .setMIFlags(Inst.getFlags());
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
+ // Remove any references to SCC. Vector instructions can't read from it, and
+ // We're just about to add the implicit use / defs of VCC, and we don't want
+ // both.
+ for (MachineOperand &Op : Inst.implicit_operands()) {
+ if (Op.getReg() == AMDGPU::SCC) {
+ // Only propagate through live-def of SCC.
+ if (Op.isDef() && !Op.isDead())
+ addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+ if (Op.isUse())
+ addSCCDefsToVALUWorklist(NewInstr, Worklist);
+ }
+ }
+ Inst.eraseFromParent();
+ Register NewDstReg;
+ if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
+ Register DstReg = NewInstr->getOperand(0).getReg();
+ assert(DstReg.isVirtual());
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
+ assert(NewDstRC);
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ NewInstr.addImm(0);
+ }
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ NewInstr->removeOperand(2);
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
+ }
+ fixImplicitOperands(*NewInstr);
+ // Legalize the operands
+ legalizeOperands(*NewInstr, MDT);
+ if (NewDstReg)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
}
// Add/sub require special handling to deal with carry outs.
std::pair<bool, MachineBasicBlock *>
-SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT) const {
if (ST.hasAddNoCarry()) {
// Assume there is no user of scc since we don't select this in that case.
@@ -6613,7 +6616,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return std::pair(false, nullptr);
}
-void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6689,7 +6692,7 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6716,7 +6719,7 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6781,7 +6784,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
}
}
-void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6810,7 +6813,7 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
-void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6839,9 +6842,9 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitUnaryOp(
- SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode, bool Swap) const {
+void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ bool Swap) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6898,7 +6901,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist,
MachineInstr &Inst,
MachineDominatorTree *MDT) const {
bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -6972,7 +6975,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineInstr &Inst, unsigned Opcode,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7039,7 +7042,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7081,8 +7084,8 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
Worklist.insert(&Xor);
}
-void SIInstrInfo::splitScalar64BitBCNT(
- SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
+ MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7119,7 +7122,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7181,9 +7184,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
void SIInstrInfo::addUsersToMoveToVALUWorklist(
- Register DstReg,
- MachineRegisterInfo &MRI,
- SetVectorType &Worklist) const {
+ Register DstReg, MachineRegisterInfo &MRI,
+ SIInstrWorklist &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
@@ -7217,7 +7219,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
-void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
+void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const {
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -7292,7 +7294,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist,
+ SIInstrWorklist &Worklist,
Register NewCond) const {
// Ensure that def inst defines SCC, which is still live.
@@ -7335,7 +7337,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
// sure that the instruction that defines SCC is added to the moveToVALU
// worklist.
void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
- SetVectorType &Worklist) const {
+ SIInstrWorklist &Worklist) const {
// Look for a preceding instruction that either defines VCC or SCC. If VCC
// then there is nothing to do because the defining instruction has been
// converted to a VALU already. If SCC then that instruction needs to be
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index be1bc0d507e3e..10fed54fbdc40 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -41,6 +41,41 @@ class ScheduleHazardRecognizer;
static const MachineMemOperand::Flags MONoClobber =
MachineMemOperand::MOTargetFlag1;
+/// Utility that stores a worklist of machine instructions.
+struct SIInstrWorklist {
+ SIInstrWorklist() : InstrList() {}
+
+ void insert(MachineInstr *MI);
+
+ MachineInstr *top() const {
+ auto iter = InstrList.begin();
+ return *iter;
+ }
+
+ void erase_top() {
+ auto iter = InstrList.begin();
+ InstrList.erase(iter);
+ }
+
+ bool empty() const { return InstrList.empty(); }
+
+ void clear() {
+ InstrList.clear();
+ DeferredList.clear();
+ }
+
+ bool isDeferred(MachineInstr *MI);
+
+ SetVector<MachineInstr *> &getDeferredList() { return DeferredList; }
+
+private:
+ /// InstrList contains the MachineInstrs to be processed.
+ SetVector<MachineInstr *> InstrList;
+ /// Deferred instructions are specific MachineInstrs
+ /// that the insert method additionally records here.
+ SetVector<MachineInstr *> DeferredList;
+};
+
class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
@@ -81,57 +116,50 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
void swapOperands(MachineInstr &Inst) const;
std::pair<bool, MachineBasicBlock *>
- moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ void lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerScalarAbs(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
- void lowerScalarXnor(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
- void splitScalarNotBinop(SetVectorType &Worklist,
- MachineInstr &Inst,
+ void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode) const;
- void splitScalarBinOpN2(SetVectorType &Worklist,
- MachineInstr &Inst,
+ void splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode) const;
- void splitScalar64BitUnaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode,
- bool Swap = false) const;
+ void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ unsigned Opcode, bool Swap = false) const;
- void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+ void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode,
MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitBCNT(SetVectorType &Worklist,
+ void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitBFE(SetVectorType &Worklist,
- MachineInstr &Inst) const;
- void movePackToVALU(SetVectorType &Worklist,
- MachineRegisterInfo &MRI,
+ void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
- SetVectorType &Worklist) const;
+ SIInstrWorklist &Worklist) const;
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist,
+ SIInstrWorklist &Worklist,
Register NewCond = Register()) const;
void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
- SetVectorType &Worklist) const;
+ SIInstrWorklist &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -1008,11 +1036,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// was moved to VGPR. \returns true if succeeded.
bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
- /// Replace this instruction's opcode with the equivalent VALU
- /// opcode. This function will also move the users of \p MI to the
- /// VALU if necessary. If present, \p MDT is updated.
- MachineBasicBlock *moveToVALU(MachineInstr &MI,
- MachineDominatorTree *MDT = nullptr) const;
+ /// Replace each instruction's opcode with the equivalent VALU
+ /// opcode. This function will also move the users of the MachineInstrs
+ /// in \p Worklist to the VALU if necessary. If present, \p MDT is
+ /// updated.
+ void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const;
+
+ void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT,
+ MachineInstr &Inst) const;
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
index 3c9f2328f1185..d3f9c2d0fbc54 100644
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -222,7 +222,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
; VI-NEXT: v_add_f32_e64 v1, s3, 2.0
; VI-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-NEXT: v_add_f32_e32 v2, s4, v2
-; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index cd38bf181fe08..edab417a03ced 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
@@ -150,7 +150,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: s_mul_i32 s0, s0, s3
@@ -261,7 +261,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_xor_b32 s0, s9, s8
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
; GFX6-NEXT: s_mul_i32 s1, s1, s3
@@ -1226,7 +1226,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
@@ -1266,7 +1266,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3
; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7
; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7]
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5
; GFX6-NEXT: v_readfirstlane_b32 s0, v5
; GFX6-NEXT: s_mul_i32 s0, s0, s15
@@ -1538,7 +1538,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -1556,7 +1556,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -1871,7 +1871,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: s_xor_b32 s5, s5, s4
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: s_xor_b32 s8, s4, s2
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
; GFX6-NEXT: s_mul_i32 s2, s2, s3
@@ -1901,7 +1901,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: s_xor_b32 s7, s7, s6
; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3
; GFX6-NEXT: s_xor_b32 s9, s6, s4
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2
; GFX6-NEXT: v_readfirstlane_b32 s4, v2
; GFX6-NEXT: s_mul_i32 s4, s4, s5
@@ -1963,7 +1963,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: s_xor_b32 s1, s1, s0
; GFX6-NEXT: v_mul_hi_u32 v2, v3, v2
; GFX6-NEXT: s_xor_b32 s0, s0, s10
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4
; GFX6-NEXT: v_readfirstlane_b32 s2, v3
@@ -2318,7 +2318,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: s_add_i32 s5, s5, s8
; GFX6-NEXT: s_xor_b32 s5, s5, s8
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
; GFX6-NEXT: s_mul_i32 s2, s2, s4
@@ -2344,7 +2344,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX6-NEXT: s_xor_b32 s5, s5, s4
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: s_sub_i32 s6, s2, s8
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
; GFX6-NEXT: s_mul_i32 s2, s2, s3
@@ -5345,7 +5345,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5
; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2
@@ -5754,7 +5754,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT: v_readfirstlane_b32 s0, v1
; GFX6-NEXT: s_mul_i32 s0, s0, s6
@@ -6090,7 +6090,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
; GFX6-NEXT: s_mul_i32 s7, s7, s6
@@ -6178,7 +6178,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -6269,7 +6269,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: s_mul_i32 s0, s0, s3
@@ -6416,7 +6416,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
@@ -6553,7 +6553,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b32 s4, s4, s6
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: s_xor_b32 s6, s6, s3
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_mul_i32 s3, s3, s2
@@ -6586,7 +6586,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b32 s5, s5, s8
; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3
; GFX6-NEXT: s_xor_b32 s4, s8, s4
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT: v_readfirstlane_b32 s6, v1
; GFX6-NEXT: s_mul_i32 s6, s6, s7
@@ -6797,7 +6797,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
; GFX6-NEXT: s_mul_i32 s7, s7, s4
@@ -7043,7 +7043,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b32 s5, s2, s8
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
; GFX6-NEXT: s_mul_i32 s7, s7, s4
@@ -7221,9 +7221,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9
; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9
; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -7571,7 +7571,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4
; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -7907,8 +7907,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12
; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
@@ -8295,7 +8295,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -8314,9 +8314,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -8359,7 +8359,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, s7
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s6, v8
; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
@@ -8607,7 +8607,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
; GFX6-NEXT: s_addc_u32 s3, s3, s12
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -8667,9 +8667,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -8964,7 +8964,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -8988,9 +8988,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
; GFX6-NEXT: s_mov_b32 s9, s8
; GFX6-NEXT: s_addc_u32 s3, s3, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -9030,7 +9030,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8
; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
@@ -9251,7 +9251,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -9272,9 +9272,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
; GFX6-NEXT: s_mov_b32 s11, 0xf000
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -9366,7 +9366,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4
; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4
@@ -9387,9 +9387,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3
; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2
; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4
@@ -9818,7 +9818,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -9861,7 +9861,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s7
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -10111,7 +10111,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
; GFX6-NEXT: s_addc_u32 s3, s3, s10
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -10133,9 +10133,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -10485,7 +10485,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -10505,9 +10505,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -10596,7 +10596,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_ashr_i32 s14, s7, 31
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3
; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3
@@ -10619,9 +10619,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2
; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2
; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 69e695c49095c..6837ebce62343 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -1801,7 +1801,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; CISI-NEXT: v_mul_lo_u32 v5, s1, v0
; CISI-NEXT: v_mul_lo_u32 v4, s0, v0
; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CISI-NEXT: v_mul_hi_u32 v3, v0, v4
; CISI-NEXT: v_mul_lo_u32 v5, v0, v2
; CISI-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -1858,9 +1858,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; CISI-NEXT: v_mul_hi_u32 v3, s2, v0
; CISI-NEXT: v_mul_lo_u32 v4, s3, v0
; CISI-NEXT: v_mov_b32_e32 v5, s3
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CISI-NEXT: v_mul_lo_u32 v3, s2, v0
-; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CISI-NEXT: v_sub_i32_e32 v4, vcc, s7, v2
; CISI-NEXT: v_sub_i32_e32 v3, vcc, s6, v3
; CISI-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -1950,7 +1950,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
; VI-NEXT: v_mul_lo_u32 v3, s9, v5
; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3
; VI-NEXT: v_mul_hi_u32 v6, v5, v0
; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1
@@ -1969,8 +1969,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: v_mul_lo_u32 v5, s9, v6
; VI-NEXT: v_mul_hi_u32 v8, v6, v0
; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v4
-; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0
; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 44a612edcfb0d..f3b746f23af8a 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -43,7 +43,7 @@
; Spill val register
-; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
+; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[RELOAD_LOAD0]], [[LOAD1]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: [[ENDIF]]:
@@ -110,15 +110,15 @@ endif:
; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]:
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
-; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
+; GCN: v_sub_i32_e32 v[[VAL_LOOP_RELOAD]], vcc, v[[VAL_LOOP_RELOAD]], v{{[0-9]+}}
; GCN: s_cmp_lg_u32
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
-; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]
-; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: [[END]]:
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
@@ -219,14 +219,14 @@ end:
; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: ds_read_b32
-; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
-; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
+; GCN: v_add_i32_e32 v[[LOAD0_RELOAD]], vcc, v[[LOAD0_RELOAD]], [[ADD:v[0-9]+]]
+; GCN: buffer_store_dword v[[LOAD0_RELOAD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
-; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
-; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: v_sub_i32_e32 v[[LOAD0_RELOAD]], vcc, v[[LOAD0_RELOAD]], v{{[0-9]+}}
+; GCN: buffer_store_dword v[[LOAD0_RELOAD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[FLOW]]
; GCN: [[ENDIF]]:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 8fd43c5811084..9ec9414d91171 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -942,7 +942,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out)
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index d15a60e3ecb7f..458a92944e809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -396,7 +396,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
; GCN: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
-; GCN: v_add_{{[iu]}}32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
+; GCN: v_add_{{[iu]}}32_e32 [[TMP1:v[0-9]+]], vcc, [[BFE]], [[TMP0]]
; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
; GCN: buffer_store_dword [[TMP2]]
define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index eb1b4f0fa7d06..3df5344ae908a 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -154,9 +154,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v7
; W64-O0-NEXT: v_mov_b32_e32 v2, v6
@@ -500,9 +497,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v15, v5
; W64-O0-NEXT: s_waitcnt vmcnt(3)
@@ -518,9 +512,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v3, v8
; W64-O0-NEXT: v_mov_b32_e32 v4, v7
@@ -532,7 +523,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v2, v12
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
@@ -540,7 +530,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v10
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
@@ -1007,9 +996,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v11
; W64-O0-NEXT: v_mov_b32_e32 v2, v10
@@ -1018,9 +1004,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
-; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v14, v7
; W64-O0-NEXT: v_mov_b32_e32 v15, v6
@@ -1032,7 +1015,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec
; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v5, v12
; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index a0d0c05f5d5cf..6d3f1f14e9176 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -40,10 +40,7 @@ body: |
; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -87,10 +84,7 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -160,10 +154,7 @@ body: |
; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
- ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -207,10 +198,7 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
- ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -280,10 +268,7 @@ body: |
; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
- ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -327,10 +312,7 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -399,18 +381,15 @@ body: |
; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
- ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
- ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec
- ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
- ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1
+ ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec
+ ; ADDR64-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
@@ -428,18 +407,15 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
- ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; W32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696
; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
- ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec
- ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1
+ ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec
+ ; W32-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
@@ -484,16 +460,13 @@ body: |
; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
- ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
- ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1
+ ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]].sub0, %subreg.sub0, [[COPY6]].sub1, %subreg.sub1
; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
@@ -512,10 +485,7 @@ body: |
; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NO-ADDR64-NEXT: {{ $}}
; W64-NO-ADDR64-NEXT: .1:
@@ -559,10 +529,7 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
- ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
- ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
- ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+ ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 648f44cad56f2..f214e9aecea5a 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; SI-NEXT: v_mul_lo_u32 v3, v3, v0
; SI-NEXT: v_mul_lo_u32 v0, v2, v0
; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -1230,7 +1230,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0
; VI-NEXT: v_mul_lo_u32 v0, v3, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index c8454531737f0..df49ff295371d 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -97,8 +97,7 @@ body: |
; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
@@ -374,8 +373,7 @@ body: |
; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
; GCN-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec
; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: S_BRANCH %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index c9a393d7bdc67..974cb71900a4d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -29,13 +29,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GCN-NEXT: v_xor_b32_e32 v0, v0, v5
; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
@@ -47,7 +47,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, v0, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -55,7 +55,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -74,13 +74,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2
; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v5
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5
; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
@@ -92,7 +92,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v0, v1
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -100,7 +100,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v2, v0
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
@@ -315,7 +315,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -338,7 +338,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
@@ -410,8 +410,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GCN-NEXT: v_xor_b32_e32 v2, v2, v5
@@ -428,16 +428,16 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
; GCN-NEXT: v_mul_lo_u32 v11, v11, v7
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
; GCN-NEXT: v_mul_hi_u32 v4, v5, v10
; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
; GCN-NEXT: v_mul_hi_u32 v6, v7, v11
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6
; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
; GCN-NEXT: v_mul_lo_u32 v6, v4, v2
@@ -448,9 +448,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4
@@ -462,8 +462,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
; GCN-NEXT: v_xor_b32_e32 v1, v1, v9
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v8, v0
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v9, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -483,8 +483,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v5
+; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5
@@ -501,16 +501,16 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v6
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6
; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6
; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2
@@ -521,9 +521,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v0, v2
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1
+; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v1, v3
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4
@@ -535,8 +535,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v9, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9
; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
@@ -824,22 +824,22 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5
; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GCN-NEXT: v_xor_b32_e32 v4, v4, v9
; GCN-NEXT: v_xor_b32_e32 v5, v5, v11
; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2
; GCN-NEXT: v_xor_b32_e32 v15, v8, v9
; GCN-NEXT: v_xor_b32_e32 v16, v10, v11
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v13
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
; GCN-NEXT: v_xor_b32_e32 v1, v1, v10
; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v12
; GCN-NEXT: v_xor_b32_e32 v6, v6, v13
; GCN-NEXT: v_xor_b32_e32 v17, v12, v13
; GCN-NEXT: v_xor_b32_e32 v2, v2, v12
@@ -862,12 +862,12 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_hi_u32 v9, v8, v9
; GCN-NEXT: v_mul_hi_u32 v11, v10, v11
; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v14
; GCN-NEXT: v_mul_hi_u32 v13, v12, v13
; GCN-NEXT: v_xor_b32_e32 v7, v7, v14
; GCN-NEXT: v_cvt_f32_u32_e32 v18, v7
; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; GCN-NEXT: v_mul_hi_u32 v8, v0, v8
; GCN-NEXT: v_mul_hi_u32 v9, v1, v9
; GCN-NEXT: v_add_i32_e32 v10, vcc, v12, v13
@@ -885,13 +885,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21
-; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v4, v0
+; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; GCN-NEXT: v_subrev_i32_e32 v12, vcc, v5, v1
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, v1, v5
; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3]
; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
; GCN-NEXT: v_add_i32_e32 v22, vcc, 1, v10
-; GCN-NEXT: v_subrev_i32_e32 v13, vcc, v6, v2
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, v2, v6
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
@@ -903,27 +903,27 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3
; GCN-NEXT: v_mul_hi_u32 v4, v18, v4
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GCN-NEXT: v_xor_b32_e32 v3, v3, v8
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v18
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5]
; GCN-NEXT: v_xor_b32_e32 v0, v0, v15
; GCN-NEXT: v_xor_b32_e32 v1, v1, v16
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5]
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v0
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v16
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v10
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v5, v4, v7
; GCN-NEXT: v_xor_b32_e32 v2, v2, v17
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_xor_b32_e32 v6, v8, v14
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4
-; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v7, v3
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v3, v7
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
@@ -931,7 +931,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GCN-NEXT: v_xor_b32_e32 v3, v3, v6
-; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_endpgm
;
@@ -955,22 +955,22 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
-; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v9
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v11
; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v10
; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9
; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11
; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9
; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6
+; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v13
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10
; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4
; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v12
; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13
; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12
@@ -993,12 +993,12 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9
; TONGA-NEXT: v_mul_hi_u32 v11, v10, v11
; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v14
; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13
; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14
; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v7
; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v10
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v11
; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8
; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9
; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v13
@@ -1016,13 +1016,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5
; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21
-; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v4, v0
+; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4
; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; TONGA-NEXT: v_subrev_u32_e32 v12, vcc, v5, v1
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v1, v5
; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3]
; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v7
; TONGA-NEXT: v_add_u32_e32 v22, vcc, 1, v10
-; TONGA-NEXT: v_subrev_u32_e32 v13, vcc, v6, v2
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v2, v6
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8
; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
@@ -1034,27 +1034,27 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3
; TONGA-NEXT: v_mul_hi_u32 v4, v18, v4
; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
-; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8
-; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v18
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4
; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5]
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16
; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5]
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v0
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v16
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v10
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc
; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17
-; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17
; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5
; TONGA-NEXT: v_xor_b32_e32 v6, v8, v14
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v7, v3
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v7
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
@@ -1062,7 +1062,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6
-; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3
+; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_endpgm
;
@@ -2030,7 +2030,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2038,7 +2038,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
@@ -2078,7 +2078,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2086,7 +2086,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
@@ -2254,7 +2254,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
@@ -2286,7 +2286,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index db22f2e12ca7a..0f58c6a96bf52 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -37,7 +37,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
; GCN-NEXT: s_addc_u32 s3, s3, s12
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -97,9 +97,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
; GCN-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, s10, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -961,7 +961,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1106,7 +1106,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -1127,9 +1127,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -1155,7 +1155,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v1, s3, v0
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_mul_lo_u32 v2, s2, v0
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index d5a1d8f649adf..fb857e484f6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -74,7 +74,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; NOSDWA-NEXT: v_subrev_u32_e32 v2, vcc, v2, v3
+; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2
; NOSDWA-NEXT: flat_store_dword v[0:1], v2
; NOSDWA-NEXT: s_endpgm
;
@@ -88,7 +88,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: v_mov_b32_e32 v1, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_subrev_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
; GFX89-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index fc28fd7575040..855a44baf63ed 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1283,12 +1283,10 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-NEXT: v_mov_b32_e32 v3, v5
; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 20c9544f73bd2..4f94d2180d30a 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -83,9 +83,9 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, v7, v3
-; SI-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, v5, v1
+; SI-NEXT: v_lshl_b32_e32 v3, v3, v7
+; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
+; SI-NEXT: v_lshl_b32_e32 v1, v1, v5
; SI-NEXT: v_lshl_b32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -153,7 +153,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index c70b0190e0c23..6b625ef8e9bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -375,8 +375,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
- ; CHECK-NEXT: undef %693.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %693, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 7)
+ ; CHECK-NEXT: undef %624.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %624, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 7)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 7097f58004855..8ebdb29706983 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ashrrev_i32_e32 v3, v7, v3
-; SI-NEXT: v_ashrrev_i32_e32 v2, v6, v2
-; SI-NEXT: v_ashrrev_i32_e32 v1, v5, v1
+; SI-NEXT: v_ashr_i32_e32 v3, v3, v7
+; SI-NEXT: v_ashr_i32_e32 v2, v2, v6
+; SI-NEXT: v_ashr_i32_e32 v1, v1, v5
; SI-NEXT: v_ashr_i32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 27b551ad88376..46befaae998b5 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -29,7 +29,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1
; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
@@ -910,7 +910,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -970,8 +970,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s15, v1
; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0
@@ -1145,7 +1145,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2
@@ -1304,7 +1304,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v5, s3, v0
; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -1352,7 +1352,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v1, s9, v0
; GCN-NEXT: v_mul_hi_u32 v2, s8, v0
; GCN-NEXT: v_mul_lo_u32 v0, s8, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 5d21043a42b08..7bc13f41262ca 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -138,9 +138,9 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v3, v3, v7
-; SI-NEXT: v_lshrrev_b32_e32 v2, v6, v2
-; SI-NEXT: v_lshrrev_b32_e32 v1, v5, v1
-; SI-NEXT: v_lshrrev_b32_e32 v0, v4, v0
+; SI-NEXT: v_lshr_b32_e32 v2, v2, v6
+; SI-NEXT: v_lshr_b32_e32 v1, v1, v5
+; SI-NEXT: v_lshr_b32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 67369b0f81875..48d093604c394 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -1,31 +1,132 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN1 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN2 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN3 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
-; GCN-LABEL: {{^}}s_sub_i32:
-; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}]
-; GCN: s_sub_i32 s{{[0-9]+}}, s[[#LOAD + 2]], s[[#LOAD + 3]]
define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; GCN1-LABEL: s_sub_i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_sub_i32 s0, s2, s3
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: s_sub_i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_sub_i32 s2, s2, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: flat_store_dword v[0:1], v2
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: s_sub_i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v0, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: s_sub_i32 s2, s2, s3
+; GCN3-NEXT: v_mov_b32_e32 v1, s2
+; GCN3-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN3-NEXT: s_endpgm
%result = sub i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}s_sub_imm_i32:
-; GCN: s_load_dword [[A:s[0-9]+]]
-; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
+; GCN1-LABEL: s_sub_imm_i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s3, 0xf000
+; GCN1-NEXT: s_mov_b32 s2, -1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_sub_i32 s4, 0x4d2, s4
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: s_sub_imm_i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_sub_i32 s2, 0x4d2, s2
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: flat_store_dword v[0:1], v2
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: s_sub_imm_i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v0, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: s_sub_i32 s0, 0x4d2, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s0
+; GCN3-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN3-NEXT: s_endpgm
%result = sub i32 1234, %a
store i32 %result, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_sub_i32:
-; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s10, s6
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s8, s2
+; GCN1-NEXT: s_mov_b32 s9, s3
+; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; GCN2-NEXT: flat_store_dword v[2:3], v0
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v2, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v0, v0, v1
+; GCN3-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN3-NEXT: s_endpgm
%b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
%a = load i32, ptr addrspace(1) %in
%b = load i32, ptr addrspace(1) %b_ptr
@@ -34,23 +135,101 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}test_sub_imm_i32:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_imm_i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s10, s6
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s8, s2
+; GCN1-NEXT: s_mov_b32 s9, s3
+; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, 0x7b, v0
+; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_imm_i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2
+; GCN2-NEXT: flat_store_dword v[0:1], v2
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_imm_i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v0, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dword v1, v0, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v1, 0x7b, v1
+; GCN3-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN3-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in
%result = sub i32 123, %a
store i32 %result, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_sub_v2i32:
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_v2i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s10, s6
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s8, s2
+; GCN1-NEXT: s_mov_b32 s9, s3
+; GCN1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_v2i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_v2i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v4, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v1, v1, v3
+; GCN3-NEXT: v_sub_u32_e32 v0, v0, v2
+; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GCN3-NEXT: s_endpgm
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
@@ -59,17 +238,65 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_sub_v4i32:
-; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_v4i32:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s10, s6
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s8, s2
+; GCN1-NEXT: s_mov_b32 s9, s3
+; GCN1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN1-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_v4i32:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: s_add_u32 s2, s2, 16
+; GCN2-NEXT: s_addc_u32 s3, s3, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v8, s0
+; GCN2-NEXT: v_mov_b32_e32 v9, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v5
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; GCN2-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_v4i32:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v8, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
+; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v3, v7, v3
+; GCN3-NEXT: v_sub_u32_e32 v2, v6, v2
+; GCN3-NEXT: v_sub_u32_e32 v1, v5, v1
+; GCN3-NEXT: v_sub_u32_e32 v0, v4, v0
+; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN3-NEXT: s_endpgm
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
%a = load <4 x i32>, ptr addrspace(1) %in
%b = load <4 x i32>, ptr addrspace(1) %b_ptr
@@ -78,10 +305,61 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_sub_i16:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_i16:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s10, 0
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v1, 0
+; GCN1-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:2 glc
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
+; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_i16:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_ushort v4, v[0:1] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: flat_load_ushort v2, v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_sub_u16_e32 v2, v4, v2
+; GCN2-NEXT: flat_store_short v[0:1], v2
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_i16:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v0, 0
+; GCN3-NEXT: v_sub_u16_e32 v1, v1, v2
+; GCN3-NEXT: global_store_short v0, v1, s[0:1]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1
@@ -92,12 +370,61 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}test_sub_v2i16:
-; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_v2i16:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s10, 0
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v1, 0
+; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GCN1-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v2, v3
+; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN1-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN1-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_v2i16:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u16_e32 v4, v0, v1
+; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GCN2-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN2-NEXT: flat_store_dword v[2:3], v0
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_v2i16:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN3-NEXT: v_mov_b32_e32 v2, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_pk_sub_i16 v0, v0, v1
+; GCN3-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
%b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
@@ -108,15 +435,72 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_sub_v4i16:
-; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GFX9: v_pk_sub_i16
-; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN1-LABEL: test_sub_v4i16:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s7, 0xf000
+; GCN1-NEXT: s_mov_b32 s10, 0
+; GCN1-NEXT: s_mov_b32 s11, s7
+; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v1, 0
+; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
+; GCN1-NEXT: s_mov_b32 s6, -1
+; GCN1-NEXT: s_mov_b32 s4, s0
+; GCN1-NEXT: s_mov_b32 s5, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GCN1-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GCN1-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GCN1-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v5, v7
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v6
+; GCN1-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN1-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN1-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN1-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN1-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: test_sub_v4i16:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u16_e32 v6, v1, v3
+; GCN2-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GCN2-NEXT: v_sub_u16_e32 v3, v0, v2
+; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GCN2-NEXT: v_or_b32_e32 v1, v6, v1
+; GCN2-NEXT: v_or_b32_e32 v0, v3, v0
+; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: test_sub_v4i16:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN3-NEXT: v_mov_b32_e32 v4, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_pk_sub_i16 v1, v1, v3
+; GCN3-NEXT: v_pk_sub_i16 v0, v0, v2
+; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
@@ -127,25 +511,112 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}s_sub_i64:
-; GCN: s_sub_u32
-; GCN: s_subb_u32
define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
+; GCN1-LABEL: s_sub_i64:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN1-NEXT: s_mov_b32 s3, 0xf000
+; GCN1-NEXT: s_mov_b32 s2, -1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_sub_u32 s4, s4, s6
+; GCN1-NEXT: s_subb_u32 s5, s5, s7
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: s_sub_i64:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_sub_u32 s2, s4, s6
+; GCN2-NEXT: s_subb_u32 s3, s5, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: s_sub_i64:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN3-NEXT: v_mov_b32_e32 v2, 0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: s_sub_u32 s0, s4, s6
+; GCN3-NEXT: s_subb_u32 s1, s5, s7
+; GCN3-NEXT: v_mov_b32_e32 v0, s0
+; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GCN3-NEXT: s_endpgm
%result = sub i64 %a, %b
store i64 %result, ptr addrspace(1) %out, align 8
ret void
}
-; GCN-LABEL: {{^}}v_sub_i64:
-; SI: v_sub_i32_e32
-; SI: v_subb_u32_e32
-
-; VI: v_sub_u32_e32
-; VI: v_subb_u32_e32
-
-; GFX9: v_sub_co_u32_e32
-; GFX9: v_subb_co_u32_e32
define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind {
+; GCN1-LABEL: v_sub_i64:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN1-NEXT: s_mov_b32 s11, 0xf000
+; GCN1-NEXT: s_mov_b32 s14, 0
+; GCN1-NEXT: s_mov_b32 s15, s11
+; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN1-NEXT: v_mov_b32_e32 v1, 0
+; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15]
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GCN1-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
+; GCN1-NEXT: s_mov_b32 s10, -1
+; GCN1-NEXT: s_mov_b32 s8, s4
+; GCN1-NEXT: s_mov_b32 s9, s5
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: v_sub_i64:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: v_sub_i64:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN3-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
+; GCN3-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, 0
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
@@ -156,22 +627,72 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}v_test_sub_v2i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; GCN1-LABEL: v_test_sub_v2i64:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN1-NEXT: s_mov_b32 s11, 0xf000
+; GCN1-NEXT: s_mov_b32 s14, 0
+; GCN1-NEXT: s_mov_b32 s15, s11
+; GCN1-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GCN1-NEXT: v_mov_b32_e32 v5, 0
+; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15]
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
+; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64
+; GCN1-NEXT: s_mov_b32 s10, -1
+; GCN1-NEXT: s_mov_b32 s8, s4
+; GCN1-NEXT: s_mov_b32 s9, s5
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
+; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
+; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: v_test_sub_v2i64:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: v_test_sub_v2i64:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN3-NEXT: v_lshlrev_b32_e32 v8, 4, v0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
+; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v8, 0
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6
+; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
@@ -182,34 +703,104 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
ret void
}
-; GCN-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; GCN1-LABEL: v_test_sub_v4i64:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN1-NEXT: s_mov_b32 s11, 0xf000
+; GCN1-NEXT: s_mov_b32 s14, 0
+; GCN1-NEXT: s_mov_b32 s15, s11
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GCN1-NEXT: v_lshlrev_b32_e32 v12, 5, v0
+; GCN1-NEXT: v_mov_b32_e32 v13, 0
+; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15]
+; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64
+; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[0:3], 0 addr64
+; GCN1-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[0:3], 0 addr64 offset:16
+; GCN1-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16
+; GCN1-NEXT: s_mov_b32 s10, -1
+; GCN1-NEXT: s_mov_b32 s8, s4
+; GCN1-NEXT: s_mov_b32 s9, s5
+; GCN1-NEXT: s_waitcnt vmcnt(2)
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
+; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v15, v11, vcc
+; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
+; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v13, v9, vcc
+; GCN1-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: v_test_sub_v4i64:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_add_u32_e32 v8, vcc, s6, v0
+; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_add_u32_e32 v12, vcc, s0, v0
+; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
+; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
+; GCN2-NEXT: v_add_u32_e32 v8, vcc, 16, v8
+; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN2-NEXT: v_add_u32_e32 v12, vcc, 16, v12
+; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN2-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GCN2-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GCN2-NEXT: v_mov_b32_e32 v17, s5
+; GCN2-NEXT: v_mov_b32_e32 v16, s4
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: s_waitcnt vmcnt(2)
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN2-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v10, v14
+; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc
+; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v8, v12
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: v_test_sub_v4i64:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN3-NEXT: v_lshlrev_b32_e32 v16, 5, v0
+; GCN3-NEXT: s_waitcnt lgkmcnt(0)
+; GCN3-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN3-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3]
+; GCN3-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16
+; GCN3-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v16, 0
+; GCN3-NEXT: s_waitcnt vmcnt(2)
+; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6
+; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14
+; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc
+; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12
+; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc
+; GCN3-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16
+; GCN3-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
+; GCN3-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid
@@ -220,18 +811,44 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
ret void
}
-; Make sure the VOP3 form of sub is initially selected. Otherwise a pair
-; of copies from/to VCC would be necessary.
-
-; GCN-LABEL: {{^}}sub_select_vop3:
-; SI: v_subrev_i32_e64 v0, s[0:1], s0, v0
-; VI: v_subrev_u32_e64 v0, s[0:1], s0, v0
-; GFX9: v_subrev_u32_e32 v0, s0, v0
-
-; GCN: ; def vcc
-; GCN: ds_write_b32
-; GCN: ; use vcc
define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
+; GCN1-LABEL: sub_select_vop3:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: v_subrev_i32_e64 v0, s[0:1], s0, v0
+; GCN1-NEXT: s_mov_b32 m0, -1
+; GCN1-NEXT: ;;#ASMSTART
+; GCN1-NEXT: ; def vcc
+; GCN1-NEXT: ;;#ASMEND
+; GCN1-NEXT: ds_write_b32 v0, v0
+; GCN1-NEXT: ;;#ASMSTART
+; GCN1-NEXT: ; use vcc
+; GCN1-NEXT: ;;#ASMEND
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: sub_select_vop3:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: v_subrev_u32_e64 v0, s[0:1], s0, v0
+; GCN2-NEXT: s_mov_b32 m0, -1
+; GCN2-NEXT: ;;#ASMSTART
+; GCN2-NEXT: ; def vcc
+; GCN2-NEXT: ;;#ASMEND
+; GCN2-NEXT: ds_write_b32 v0, v0
+; GCN2-NEXT: ;;#ASMSTART
+; GCN2-NEXT: ; use vcc
+; GCN2-NEXT: ;;#ASMEND
+; GCN2-NEXT: s_endpgm
+;
+; GCN3-LABEL: sub_select_vop3:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: v_subrev_u32_e32 v0, s0, v0
+; GCN3-NEXT: ;;#ASMSTART
+; GCN3-NEXT: ; def vcc
+; GCN3-NEXT: ;;#ASMEND
+; GCN3-NEXT: ds_write_b32 v0, v0
+; GCN3-NEXT: ;;#ASMSTART
+; GCN3-NEXT: ; use vcc
+; GCN3-NEXT: ;;#ASMEND
+; GCN3-NEXT: s_endpgm
%vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
%sub = sub i32 %v, %s
store i32 %sub, ptr addrspace(3) undef
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index a5b1fa844e0c2..21038e777d22d 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -32,7 +32,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; SI-NEXT: v_mul_lo_u32 v3, v2, v1
; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; SI-NEXT: v_subrev_i32_e32 v3, vcc, v1, v0
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -68,7 +68,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; VI-NEXT: v_mul_lo_u32 v3, v2, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
-; VI-NEXT: v_subrev_u32_e32 v3, vcc, v1, v0
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v0, v1
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -100,7 +100,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_mul_lo_u32 v5, v4, v1
; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4
; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5
-; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v1, v0
+; GCN-NEXT: v_sub_u32_e32 v5, vcc, v0, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
@@ -198,7 +198,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; SI-NEXT: v_mul_lo_u32 v1, s4, v0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: v_mul_hi_u32 v1, v0, v1
-; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_mul_hi_u32 v0, s2, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: s_mul_i32 s0, s0, s3
@@ -231,7 +231,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; VI-NEXT: v_mul_lo_u32 v1, s4, v0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: v_mul_hi_u32 v1, v0, v1
-; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_mul_hi_u32 v0, s2, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: s_mul_i32 s0, s0, s3
@@ -260,7 +260,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v1, s4, v0
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_mul_i32 s4, s4, s3
@@ -372,7 +372,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_mul_lo_u32 v7, v7, v5
; SI-NEXT: v_mul_hi_u32 v6, v4, v6
; SI-NEXT: v_mul_hi_u32 v7, v5, v7
-; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; SI-NEXT: v_mul_hi_u32 v4, v0, v4
; SI-NEXT: v_mul_hi_u32 v5, v1, v5
@@ -384,9 +384,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5
; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; SI-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0
+; SI-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; SI-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1
+; SI-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4
@@ -427,7 +427,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_mul_lo_u32 v7, v7, v5
; VI-NEXT: v_mul_hi_u32 v6, v4, v6
; VI-NEXT: v_mul_hi_u32 v7, v5, v7
-; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; VI-NEXT: v_mul_hi_u32 v4, v0, v4
; VI-NEXT: v_mul_hi_u32 v5, v1, v5
@@ -439,9 +439,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5
; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2
; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1
+; VI-NEXT: v_sub_u32_e32 v7, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4
@@ -478,7 +478,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mul_hi_u32 v8, v7, v8
-; GCN-NEXT: v_add_u32_e32 v6, vcc, v9, v6
+; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v9
; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GCN-NEXT: v_mul_hi_u32 v7, v1, v7
@@ -490,9 +490,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0
+; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1
+; GCN-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6
@@ -661,9 +661,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_mul_hi_u32 v13, v12, v13
; SI-NEXT: v_mul_hi_u32 v15, v14, v15
; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10
+; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13
-; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14
+; SI-NEXT: v_add_i32_e32 v11, vcc, v14, v15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_hi_u32 v8, v4, v8
; SI-NEXT: v_mul_hi_u32 v9, v5, v9
@@ -685,13 +685,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
-; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4
+; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0
; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1]
-; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5
+; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1
; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3]
-; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6
+; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2
; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5]
-; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7
+; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3
; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7]
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8
@@ -756,9 +756,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_mul_hi_u32 v13, v12, v13
; VI-NEXT: v_mul_hi_u32 v15, v14, v15
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9
-; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11
; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13
-; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v11, vcc, v14, v15
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_hi_u32 v8, v4, v8
; VI-NEXT: v_mul_hi_u32 v9, v5, v9
@@ -780,13 +780,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
-; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4
+; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0
; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1]
-; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5
+; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1
; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3]
-; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6
+; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2
; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5]
-; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7
+; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3
; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7]
; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8
@@ -851,9 +851,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_hi_u32 v15, v14, v15
; GCN-NEXT: v_mul_hi_u32 v17, v16, v17
; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11
-; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12
+; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13
; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15
-; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16
+; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v10, v4, v10
; GCN-NEXT: v_mul_hi_u32 v11, v5, v11
@@ -875,13 +875,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
-; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v0, v4
+; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1]
-; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v1, v5
+; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1
; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3]
-; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v2, v6
+; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2
; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5]
-; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v3, v7
+; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3
; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1]
; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10
@@ -1882,7 +1882,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_mul_lo_u32 v3, v1, v0
; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0
; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
@@ -1929,7 +1929,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_mul_lo_u32 v3, v1, v0
; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
-; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
@@ -1984,7 +1984,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v5, v4, v3
; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4
; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5
-; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2
+; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -2385,7 +2385,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -2411,7 +2411,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
; VI-NEXT: v_cvt_i32_f32_e32 v1, v1
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
@@ -2435,7 +2435,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4|
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GCN-NEXT: flat_store_byte v[0:1], v2
; GCN-NEXT: s_endpgm
;
@@ -2556,7 +2556,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; SI-NEXT: v_mul_lo_u32 v6, v2, s4
; SI-NEXT: s_mov_b32 s4, 0x186a0
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
-; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; SI-NEXT: v_mul_lo_u32 v5, v2, v4
; SI-NEXT: v_mul_hi_u32 v7, v2, v6
; SI-NEXT: v_mul_hi_u32 v8, v2, v4
@@ -2632,7 +2632,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
; VI-NEXT: v_mul_lo_u32 v4, v7, s6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4
; VI-NEXT: v_mul_hi_u32 v5, v6, v2
; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3
@@ -2649,7 +2649,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
; VI-NEXT: v_mul_lo_u32 v4, v7, s6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v3
; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
; VI-NEXT: v_mul_hi_u32 v8, v6, v2
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
@@ -2719,7 +2719,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
; GCN-NEXT: v_mul_lo_u32 v4, v7, s6
; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3
+; GCN-NEXT: v_add_u32_e32 v8, vcc, v3, v4
; GCN-NEXT: v_mul_hi_u32 v5, v6, v2
; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3
@@ -2736,7 +2736,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
; GCN-NEXT: v_mul_lo_u32 v4, v7, s6
; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
-; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4
+; GCN-NEXT: v_add_u32_e32 v5, vcc, v4, v3
; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
; GCN-NEXT: v_mul_hi_u32 v8, v6, v2
; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 2e3c38df149ae..ba44b1c6a5c96 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -27,7 +27,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -86,9 +86,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_hi_u32 v3, s8, v0
; GCN-NEXT: v_mul_lo_u32 v4, s9, v0
; GCN-NEXT: v_mov_b32_e32 v5, s9
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, s8, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -699,7 +699,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
; GCN-NEXT: v_mul_hi_u32 v5, v1, v6
; GCN-NEXT: v_mul_hi_u32 v7, v1, v3
@@ -719,9 +719,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GCN-NEXT: v_mul_lo_u32 v4, s8, v1
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_mul_lo_u32 v7, v1, v3
; GCN-NEXT: v_mul_hi_u32 v8, v1, v4
; GCN-NEXT: v_mul_hi_u32 v9, v1, v3
@@ -899,7 +899,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -920,9 +920,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
@@ -948,7 +948,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v1, s3, v0
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_mul_lo_u32 v2, s2, v0
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2
@@ -1366,7 +1366,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s8
; GCN-NEXT: v_mul_lo_u32 v3, v1, s8
; GCN-NEXT: v_mul_lo_u32 v4, v0, s8
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_mul_lo_u32 v3, v0, v2
; GCN-NEXT: v_mul_hi_u32 v5, v0, v4
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GCN-NEXT: v_mov_b32_e32 v5, s7
; GCN-NEXT: v_sub_i32_e32 v8, vcc, s6, v8
; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
@@ -1536,8 +1536,8 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-NEXT: v_mul_hi_u32 v4, v2, s4
; GCN-NEXT: v_mul_lo_u32 v5, v3, s4
; GCN-NEXT: v_mul_lo_u32 v6, v2, s4
-; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_mul_lo_u32 v5, v2, v4
; GCN-NEXT: v_mul_hi_u32 v7, v2, v6
; GCN-NEXT: v_mul_hi_u32 v8, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 21ae4feb25322..9b9a1851e6095 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -175,7 +175,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -193,7 +193,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_readfirstlane_b32 s6, v0
; GFX6-NEXT: s_mul_i32 s6, s6, s7
@@ -223,7 +223,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -241,7 +241,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
@@ -362,7 +362,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -380,7 +380,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -450,7 +450,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -468,7 +468,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 79643026ea70f..d6fcda0a02c6b 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -29,7 +29,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
; GCN-NEXT: v_mul_lo_u32 v3, s13, v0
; GCN-NEXT: v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1
; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v5, s1, v0
; GCN-NEXT: v_mul_lo_u32 v4, s0, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
@@ -778,7 +778,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v1, s7, v0
; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
; GCN-NEXT: v_mul_lo_u32 v0, s6, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
@@ -912,8 +912,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
; GCN-NEXT: v_mul_lo_u32 v4, v1, s2
; GCN-NEXT: v_mul_lo_u32 v3, v0, s2
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
@@ -969,7 +969,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_lo_u32 v1, v1, 24
; GCN-NEXT: v_mul_hi_u32 v2, v0, 24
; GCN-NEXT: v_mul_lo_u32 v0, v0, 24
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 0a21a77b5b94a..5ba0c3f371ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -178,8 +178,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_]]
; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0
entry:
-; %break = icmp sgt i32 %bound, 0
-; br i1 %break, label %for.body, label %for.end
br label %for.body
for.body:
@@ -235,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.1.Flow:
; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9
- ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY47:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9
+ ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef [[COPY49:%[0-9]+]]:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY51:%[0-9]+]]:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef [[COPY53:%[0-9]+]]:vgpr_32, %bb.9
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
@@ -251,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.3:
; SI-NEXT: successors: %bb.4(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
- ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2
+ ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY57:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+ ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef [[COPY59:%[0-9]+]]:vgpr_32, %bb.4, [[PHI1]], %bb.2
; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -288,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.7:
; SI-NEXT: successors: %bb.8(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
- ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6
+ ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef [[COPY59:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+ ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef [[COPY61:%[0-9]+]]:vgpr_32, %bb.8, [[COPY4]], %bb.6
; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
@@ -358,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.1.Flow:
; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9
- ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY50:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9
+ ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY52:%[0-9]+]]:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef [[COPY54:%[0-9]+]]:vgpr_32, %bb.9
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
@@ -373,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.3:
; SI-NEXT: successors: %bb.4(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+ ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef [[COPY56:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -409,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.7:
; SI-NEXT: successors: %bb.8(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+ ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY58:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
@@ -477,8 +475,8 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
; SI-NEXT: {{ $}}
; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec
- ; SI-NEXT: %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
- ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1
+ ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
@@ -511,7 +509,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
; SI-NEXT: bb.6.sw.bb18:
; SI-NEXT: successors: %bb.5(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
+ ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef [[COPY38:%[0-9]+]]:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
; SI-NEXT: S_BRANCH %bb.5
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index f4765a3286187..8a951ca9f62fc 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -434,7 +434,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
@@ -586,7 +585,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4
@@ -723,7 +721,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
@@ -742,7 +739,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
@@ -770,9 +766,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
@@ -878,23 +871,23 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -918,24 +911,24 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
@@ -944,140 +937,126 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s13
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s14
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s15
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s16
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s19
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s20
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s21
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s22
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s23
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s24
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s25
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s26
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s27
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s28
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s29
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; kill: def $vgpr45 killed $vgpr45 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr46 killed $vgpr46 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr44 killed $vgpr44 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v45
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47
+; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46
+; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45
; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44
+; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47
; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45
; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44
; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec
@@ -1100,28 +1079,23 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
@@ -1220,27 +1194,27 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]