[llvm] daa4728 - [AMDGPU] Add CodeGen support for GFX12 s_mul_u64 (#75825)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 8 11:13:43 PST 2024
Author: Jay Foad
Date: 2024-01-08T19:13:38Z
New Revision: daa4728deed3d222ff163cfb963321938549ddf1
URL: https://github.com/llvm/llvm-project/commit/daa4728deed3d222ff163cfb963321938549ddf1
DIFF: https://github.com/llvm/llvm-project/commit/daa4728deed3d222ff163cfb963321938549ddf1.diff
LOG: [AMDGPU] Add CodeGen support for GFX12 s_mul_u64 (#75825)
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 8d4cad4c07bc74..0c77fe72595880 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -104,6 +104,13 @@ def foldable_fneg : GICombineRule<
[{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
(apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
+// Detects s_mul_u64 instructions whose higher bits are zero- or sign-extended.
+def smulu64 : GICombineRule<
+ (defs root:$smul, unsigned_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_MUL):$smul,
+ [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
+ (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
+
def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
def sign_extension_in_reg : GICombineRule<
@@ -149,7 +156,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, sign_extension_in_reg]> {
+ rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
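
A minimal standalone sketch of why the smulu64 combine above is sound for the
zero-extended case (illustrative C++, not part of the patch): if both 64-bit
operands have their high 32 bits known to be zero, the full 64-bit product is
exactly the 32x32->64 unsigned multiply of the low halves, which is what
G_AMDGPU_S_MUL_U64_U32 models.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t A = 0x00000000DEADBEEFull; // high 32 bits known zero
      uint64_t B = 0x0000000012345678ull;
      uint64_t Wide = A * B;                                  // what G_MUL computes
      uint64_t Narrow = (uint64_t)(uint32_t)A * (uint32_t)B;  // u32 x u32 -> u64
      assert(Wide == Narrow);
      return 0;
    }
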
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index dfbe5c7fed8826..aa235c07e99597 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -701,13 +701,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.maxScalar(0, S32);
}
- getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .custom();
+ if (ST.hasScalarSMulU64()) {
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .custom();
+ } else {
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .custom();
+ }
assert(ST.hasMad64_32());
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 7b18e1f805d8f9..21bfab52c6c4b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -104,6 +104,14 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
void applyCombineSignExtendInReg(MachineInstr &MI,
MachineInstr *&MatchInfo) const;
+ // Find the s_mul_u64 instructions where the higher bits are either
+ // zero-extended or sign-extended.
+ bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+ // Replace the s_mul_u64 instruction with G_AMDGPU_S_MUL_I64_I32 if the
+ // higher 33 bits are sign-extended and with G_AMDGPU_S_MUL_U64_U32 if the
+ // higher 32 bits are zero-extended.
+ void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+
private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
@@ -419,6 +427,32 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
MI.eraseFromParent();
}
+bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
+ MachineInstr &MI, unsigned &NewOpcode) const {
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI.getType(Src0) != LLT::scalar(64))
+ return false;
+
+ if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
+ KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
+ NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
+ return true;
+ }
+
+ if (KB->computeNumSignBits(Src1) >= 33 &&
+ KB->computeNumSignBits(Src0) >= 33) {
+ NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
+ return true;
+ }
+ return false;
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
+ MachineInstr &MI, unsigned &NewOpcode) const {
+ Helper.replaceOpcodeWith(MI, NewOpcode);
+}
+
// Pass boilerplate
// ================
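
The signed variant of the match relies on computeNumSignBits: at least 33 sign
bits means each operand fits in a signed 32-bit range, so the sign-extended
product of the low halves reproduces the full 64-bit result, even at the
boundaries. A standalone check (illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // 33+ sign bits <=> the value is in [-2^31, 2^31), i.e. fits in int32_t.
      int64_t A = -2147483648LL; // INT32_MIN: exactly 33 sign bits
      int64_t B = 2147483647LL;  // INT32_MAX: also 33 sign bits
      int64_t Wide = A * B;
      int64_t Narrow = (int64_t)(int32_t)A * (int64_t)(int32_t)B;
      assert(Wide == Narrow);
      return 0;
    }
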
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 92182ec0694266..ecb7bb9d1d9755 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
return true;
}
+// Break s_mul_u64 into 32-bit vector operations.
+void AMDGPURegisterBankInfo::applyMappingSMULU64(
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src0Regs.empty() && Src1Regs.empty());
+ applyDefaultMapping(OpdMapper);
+ return;
+ }
+
+ assert(DefRegs.size() == 2);
+ assert(Src0Regs.size() == Src1Regs.size() &&
+ (Src0Regs.empty() || Src0Regs.size() == 2));
+
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ MachineInstr &MI = OpdMapper.getMI();
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT HalfTy = LLT::scalar(32);
+
+ // Depending on where the source registers came from, the generic code may
+ // have decided to split the inputs already or not. If not, we still need to
+ // extract the values.
+
+ if (Src0Regs.empty())
+ split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
+ else
+ setRegsToType(MRI, Src0Regs, HalfTy);
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else
+ setRegsToType(MRI, Src1Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ // The multiplication is done as follows:
+ //
+ // Op1H Op1L
+ // * Op0H Op0L
+ // --------------------
+ // Op1H*Op0L Op1L*Op0L
+ // + Op1H*Op0H Op1L*Op0H
+ // -----------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+ // value and that would overflow.
+ // The low 32-bit value is Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
+ // Op1L*Op0L).
+
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+
+ Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
+ Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
+ Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
+ Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
+ B.buildAdd(DefRegs[1], Add, MulHiLo);
+ B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
+
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
+}
+
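
The comment above is the schoolbook 64x64 decomposition; here is a standalone
check of the lo/hi/carry arithmetic (illustrative C++; the unsigned 32-bit
multiply models V_MUL_LO_U32 and the shifted widening multiply models
V_MUL_HI_U32):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = 0x123456789ABCDEF0ull, Y = 0x0FEDCBA987654321ull;
      uint32_t XL = (uint32_t)X, XH = (uint32_t)(X >> 32);
      uint32_t YL = (uint32_t)Y, YH = (uint32_t)(Y >> 32);
      uint32_t Lo = XL * YL;                                   // V_MUL_LO_U32
      uint32_t Carry = (uint32_t)(((uint64_t)XL * YL) >> 32);  // V_MUL_HI_U32
      uint32_t Hi = XL * YH + XH * YL + Carry; // XH*YH is dropped: it would
                                               // only affect bits >= 64
      assert((((uint64_t)Hi << 32) | Lo) == X * Y);
      return 0;
    }
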
void AMDGPURegisterBankInfo::applyMappingImpl(
MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
+ // Special case for s_mul_u64. There is not a vector equivalent of
+ // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
+ // multiplications.
+ if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ applyMappingSMULU64(B, OpdMapper);
+ return;
+ }
+
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
break;
const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank)
break;
@@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
+ // This is a special case for s_mul_u64. We use the
+ // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
+ // where the 33 higher bits are sign-extended and the
+ // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
+ // where the 32 higher bits are zero-extended. If scalar registers are
+ // selected, both opcodes are lowered as s_mul_u64. If vector registers
+ // are selected, then G_AMDGPU_S_MUL_I64_I32 and
+ // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
+
+ // Insert basic copies.
+ applyDefaultMapping(OpdMapper);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg0 = MI.getOperand(1).getReg();
+ Register SrcReg1 = MI.getOperand(2).getReg();
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
+ "that handles only 64-bit operands.");
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
+ // with s_mul_u64 operation.
+ if (DstBank == &AMDGPU::SGPRRegBank) {
+ MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
+ MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
+ MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
+ MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
+ return;
+ }
+
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
+ // with a vector mad.
+ assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
+ "The destination operand should be in vector registers.");
+
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Extract the lower subregister from the first operand.
+ Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
+ MRI.setType(Op0L, S32);
+ B.buildTrunc(Op0L, SrcReg0);
+
+ // Extract the lower subregister from the second operand.
+ Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
+ MRI.setType(Op1L, S32);
+ B.buildTrunc(Op1L, SrcReg1);
+
+ unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
+ ? AMDGPU::G_AMDGPU_MAD_U64_U32
+ : AMDGPU::G_AMDGPU_MAD_I64_I32;
+
+ MachineIRBuilder B(MI);
+ Register Zero64 = B.buildConstant(S64, 0).getReg(0);
+ MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
+ Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
+ B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
+ MI.eraseFromParent();
+ return;
+ }
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())
@@ -3669,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AND:
case AMDGPU::G_OR:
- case AMDGPU::G_XOR: {
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_MUL: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Size == 1) {
const RegisterBank *DstBank
@@ -3737,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PTRMASK:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
- case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
@@ -3755,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
case AMDGPU::G_UBFX:
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
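
The VGPR fallback above builds a mad with a zero addend, which is just a
widening multiply; a one-line model of that identity (illustrative sketch,
with the V_MAD_U64_U32 semantics taken from the comment above):

    #include <cassert>
    #include <cstdint>

    // Model of mad_u64_u32: a 32x32->64 multiply plus a 64-bit addend.
    static uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
      return (uint64_t)A * B + C;
    }

    int main() {
      uint32_t A = 0xDEADBEEFu, B = 0x12345678u;
      // With a zero addend the mad computes exactly the full product that
      // G_AMDGPU_S_MUL_U64_U32 represents.
      assert(mad_u64_u32(A, B, 0) == (uint64_t)A * B);
      return 0;
    }
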
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index b5d16e70ab23a2..2bb5ef57fe0314 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -84,6 +84,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
bool applyMappingMAD_64_32(MachineIRBuilder &B,
const OperandsMapper &OpdMapper) const;
+ void applyMappingSMULU64(MachineIRBuilder &B,
+ const OperandsMapper &OpdMapper) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ce3164d7b92e5d..f6f37f5170a403 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -683,6 +683,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+ bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
+
bool hasUnpackedD16VMem() const {
return HasUnpackedD16VMem;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e865c73015d29f..209debb3a10581 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -835,6 +835,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
+ if (Subtarget->hasScalarSMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Custom);
+
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
@@ -5566,7 +5569,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL:
case ISD::ADD:
case ISD::SUB:
- case ISD::MUL:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
@@ -5580,6 +5582,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SADDSAT:
case ISD::SSUBSAT:
return splitBinaryVectorOp(Op, DAG);
+ case ISD::MUL:
+ return lowerMUL(Op, DAG);
case ISD::SMULO:
case ISD::UMULO:
return lowerXMULO(Op, DAG);
@@ -6235,6 +6239,66 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}
+// Custom lowering for vector multiplications and s_mul_u64.
+SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // Split vector operands.
+ if (VT.isVector())
+ return splitBinaryVectorOp(Op, DAG);
+
+ assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
+
+ // There are four ways to lower s_mul_u64:
+ //
+ // 1. If all the operands are uniform, then we lower it as it is.
+ //
+ // 2. If the operands are divergent, then we have to split s_mul_u64 into
+ // 32-bit multiplications because there is not a vector equivalent of
+ // s_mul_u64.
+ //
+ // 3. If the cost model decides that it is more efficient to use vector
+ // registers, then we have to split s_mul_u64 into 32-bit multiplications.
+ // This happens in splitScalarSMulU64() in SIInstrInfo.cpp.
+ //
+ // 4. If the cost model decides to use vector registers and both of the
+ // operands are zero-extended/sign-extended from 32 bits, then we split the
+ // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
+ // possible to check whether the operands are zero-extended or sign-extended
+ // in SIInstrInfo.cpp. For this reason, here we replace s_mul_u64 with
+ // S_MUL_U64_U32_PSEUDO if both operands are zero-extended and with
+ // S_MUL_I64_I32_PSEUDO if both operands are sign-extended. If the cost
+ // model decides that we have to use vector registers, then
+ // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits the pseudo into two
+ // 32-bit vector multiplications. If the cost model decides that we should
+ // use scalar registers, then the pseudo is lowered as s_mul_u64 in
+ // expandPostRAPseudo() in SIInstrInfo.cpp.
+
+ if (Op->isDivergent())
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ // If both operands are zero-extended from 32 bits, then we replace s_mul_u64
+ // with S_MUL_U64_U32_PSEUDO. If both operands are sign-extended from 32 bits,
+ // then we replace s_mul_u64 with S_MUL_I64_I32_PSEUDO.
+ KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
+ unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
+ KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
+ unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
+ SDLoc SL(Op);
+ if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
+ unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
+ unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
+ if (Op0SignBits >= 33 && Op1SignBits >= 33)
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
+ // If all the operands are uniform, then we lower s_mul_u64 as it is.
+ return Op;
+}
+
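
The selection logic in lowerMUL above can be summarized as a small decision
function; the helper below is hypothetical (not LLVM API) and only mirrors the
order of the known-bits checks:

    #include <cassert>

    enum class MulLowering { U64U32Pseudo, I64I32Pseudo, Plain };

    // LZn = known leading zeros, SBn = known sign bits of operand n.
    static MulLowering classify(unsigned LZ0, unsigned LZ1, unsigned SB0,
                                unsigned SB1) {
      if (LZ0 >= 32 && LZ1 >= 32)
        return MulLowering::U64U32Pseudo; // S_MUL_U64_U32_PSEUDO
      if (SB0 >= 33 && SB1 >= 33)
        return MulLowering::I64I32Pseudo; // S_MUL_I64_I32_PSEUDO
      return MulLowering::Plain;          // keep s_mul_u64 as-is
    }

    int main() {
      assert(classify(32, 40, 32, 40) == MulLowering::U64U32Pseudo);
      assert(classify(0, 0, 33, 64) == MulLowering::I64I32Pseudo);
      assert(classify(0, 0, 1, 1) == MulLowering::Plain);
      return 0;
    }
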
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 00f9ddf11ea7a5..92b38ebade6217 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 67992929ab3567..d4c7a457e9aae2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+
+ case AMDGPU::S_MUL_U64_U32_PSEUDO:
+ case AMDGPU::S_MUL_I64_I32_PSEUDO:
+ MI.setDesc(get(AMDGPU::S_MUL_U64));
+ break;
}
return true;
}
@@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Default handling
break;
}
+
+ case AMDGPU::S_MUL_U64:
+ // Split s_mul_u64 into 32-bit vector multiplications.
+ splitScalarSMulU64(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+
+ case AMDGPU::S_MUL_U64_U32_PSEUDO:
+ case AMDGPU::S_MUL_I64_I32_PSEUDO:
+ // This is a special case of s_mul_u64 where both operands are either
+ // zero-extended or sign-extended.
+ splitScalarSMulPseudo(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
@@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+// There is not a vector equivalent of s_mul_u64. For this reason, we need to
+// split s_mul_u64 into 32-bit vector multiplications.
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC =
+ RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src0SubRC))
+ Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+ const TargetRegisterClass *Src1SubRC =
+ RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src1SubRC))
+ Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+ // First, we extract the low 32-bit and high 32-bit values from each of the
+ // operands.
+ MachineOperand Op0L =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Op1L =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op0H =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Op1H =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ // The multiplication is done as follows:
+ //
+ // Op1H Op1L
+ // * Op0H Op0L
+ // --------------------
+ // Op1H*Op0L Op1L*Op0L
+ // + Op1H*Op0H Op1L*Op0H
+ // -----------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+ // value and that would overflow.
+ // The low 32-bit value is Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
+
+ Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Op1L_Op0H =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .add(Op0H);
+
+ Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Op1H_Op0L =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .add(Op0L);
+
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Carry =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
+ .add(Op1L)
+ .add(Op0L);
+
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+ .add(Op1L)
+ .add(Op0L);
+
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
+ .addReg(Op1L_Op0H_Reg)
+ .addReg(Op1H_Op0L_Reg);
+
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
+ .addReg(AddReg)
+ .addReg(CarryReg);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*Op1L_Op0H, MDT);
+ legalizeOperands(*Op1H_Op0L, MDT);
+ legalizeOperands(*Carry, MDT);
+ legalizeOperands(*LoHalf, MDT);
+ legalizeOperands(*Add, MDT);
+ legalizeOperands(*HiHalf, MDT);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
+// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
+// multiplications.
+void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC =
+ RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src0SubRC))
+ Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+ const TargetRegisterClass *Src1SubRC =
+ RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src1SubRC))
+ Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+ // First, we extract the low 32-bit and high 32-bit values from each of the
+ // operands.
+ MachineOperand Op0L =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Op1L =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ unsigned Opc = Inst.getOpcode();
+ unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
+ ? AMDGPU::V_MUL_HI_U32_e64
+ : AMDGPU::V_MUL_HI_I32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
+
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+ .add(Op1L)
+ .add(Op0L);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*HiHalf, MDT);
+ legalizeOperands(*LoHalf, MDT);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
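
Because the pseudos guarantee extended inputs, only the low halves matter here:
the high 32 bits of the result come from a single mul_hi (unsigned or signed).
A standalone check of the signed variant (illustrative; the arithmetic shift of
the widening product models V_MUL_HI_I32):

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t A = -123456, B = 654321; // low halves of sign-extended inputs
      uint32_t Lo = (uint32_t)A * (uint32_t)B;          // V_MUL_LO_U32
      uint32_t Hi = (uint32_t)(((int64_t)A * B) >> 32); // V_MUL_HI_I32
      int64_t Res = (int64_t)(((uint64_t)Hi << 32) | Lo);
      assert(Res == (int64_t)A * (int64_t)B);
      return 0;
    }
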
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineInstr &Inst, unsigned Opcode,
MachineDominatorTree *MDT) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 46eee6fae0a52a..37ee159362a28c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -138,6 +138,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Opcode,
MachineDominatorTree *MDT = nullptr) const;
+ void splitScalarSMulU64(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const;
+
+ void splitScalarSMulPseudo(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const;
+
void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b0b7854ffc0678..1cd8a37c3aa997 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3853,6 +3853,18 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
let mayStore = 0;
}
+def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c9687ac368d305..5f021307e18eee 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -673,6 +673,16 @@ let SubtargetPredicate = isGFX12Plus in {
let isCommutable = 1;
}
+ // The higher 32 bits of the inputs contain the sign extension bits.
+ def S_MUL_I64_I32_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ >;
+
+ // The higher 32 bits of the inputs are zero.
+ def S_MUL_U64_U32_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ >;
+
} // End SubtargetPredicate = isGFX12Plus
let Uses = [SCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
index 5b0ed61a3313b8..2bf8649e762427 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
@@ -1,9 +1,10 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX89,GFX8 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX89,GFX9PLUS %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX1011 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX1011 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX12 %s
---
name: test_mul_s32
@@ -11,34 +12,13 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; GFX6-LABEL: name: test_mul_s32
- ; GFX6: liveins: $vgpr0, $vgpr1
- ; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX6-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX8-LABEL: name: test_mul_s32
- ; GFX8: liveins: $vgpr0, $vgpr1
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX8-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX9-LABEL: name: test_mul_s32
- ; GFX9: liveins: $vgpr0, $vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX10-LABEL: name: test_mul_s32
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](s32)
+ ; GCN-LABEL: name: test_mul_s32
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
+ ; GCN-NEXT: $vgpr0 = COPY [[MUL]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_MUL %0, %1
@@ -51,50 +31,17 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX6-LABEL: name: test_mul_v2s32
- ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
- ; GFX6-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
- ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32)
- ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
- ; GFX8-LABEL: name: test_mul_v2s32
- ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
- ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
- ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32)
- ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
- ; GFX9-LABEL: name: test_mul_v2s32
- ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
- ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
- ; GFX10-LABEL: name: test_mul_v2s32
- ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32)
- ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; GCN-LABEL: name: test_mul_v2s32
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
+ ; GCN-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32)
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_MUL %0, %1
@@ -122,54 +69,48 @@ body: |
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX8-LABEL: name: test_mul_s64
- ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
- ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX9-LABEL: name: test_mul_s64
- ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX10-LABEL: name: test_mul_s64
- ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
- ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32)
- ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX89-LABEL: name: test_mul_s64
+ ; GFX89: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX89-NEXT: {{ $}}
+ ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
+ ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
+ ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
+ ; GFX89-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX1011-LABEL: name: test_mul_s64
+ ; GFX1011: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1011-NEXT: {{ $}}
+ ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
+ ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+ ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]]
+ ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
+ ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
+ ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32)
+ ; GFX1011-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX12-LABEL: name: test_mul_s64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_MUL %0, %1
@@ -209,90 +150,76 @@ body: |
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL3]](s32), [[ADD3]](s32)
; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX8-LABEL: name: test_mul_v2s64
- ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
- ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]]
- ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32)
- ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]]
- ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
- ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV15]], [[ANYEXT1]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]]
- ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
- ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32)
- ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
- ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX9-LABEL: name: test_mul_v2s64
- ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]]
- ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32)
- ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]]
- ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV15]], [[ANYEXT1]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]]
- ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
- ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
- ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX10-LABEL: name: test_mul_v2s64
- ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
- ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]]
- ; GFX10-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV4]], [[UV7]]
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV9]], [[MUL]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV5]], [[UV6]]
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
- ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[ADD1]](s32)
- ; GFX10-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; GFX10-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV10]](s32), [[UV12]], [[C]]
- ; GFX10-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_2]](s64)
- ; GFX10-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UV13]]
- ; GFX10-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[UV15]], [[MUL2]]
- ; GFX10-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UV12]]
- ; GFX10-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[MUL3]]
- ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV14]](s32), [[ADD3]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
- ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
+ ; GFX89-LABEL: name: test_mul_v2s64
+ ; GFX89: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX89-NEXT: {{ $}}
+ ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+ ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+ ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]]
+ ; GFX89-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX89-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
+ ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32)
+ ; GFX89-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GFX89-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]]
+ ; GFX89-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
+ ; GFX89-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV15]], [[ANYEXT1]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]]
+ ; GFX89-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
+ ; GFX89-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32)
+ ; GFX89-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+ ; GFX89-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
+ ; GFX1011-LABEL: name: test_mul_v2s64
+ ; GFX1011: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1011-NEXT: {{ $}}
+ ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+ ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+ ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GFX1011-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]]
+ ; GFX1011-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV4]], [[UV7]]
+ ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV9]], [[MUL]]
+ ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV5]], [[UV6]]
+ ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
+ ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[ADD1]](s32)
+ ; GFX1011-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GFX1011-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV10]](s32), [[UV12]], [[C]]
+ ; GFX1011-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_2]](s64)
+ ; GFX1011-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UV13]]
+ ; GFX1011-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[UV15]], [[MUL2]]
+ ; GFX1011-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UV12]]
+ ; GFX1011-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[MUL3]]
+ ; GFX1011-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV14]](s32), [[ADD3]](s32)
+ ; GFX1011-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+ ; GFX1011-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
+ ; GFX12-LABEL: name: test_mul_v2s64
+ ; GFX12: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+ ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+ ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UV]], [[UV2]]
+ ; GFX12-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[UV1]], [[UV3]]
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64)
+ ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
%2:_(<2 x s64>) = G_MUL %0, %1
@@ -314,36 +241,17 @@ body: |
; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]]
; GFX6-NEXT: $vgpr0 = COPY [[AND]](s32)
- ; GFX8-LABEL: name: test_mul_s16
- ; GFX8: liveins: $vgpr0, $vgpr1
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
- ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16)
- ; GFX8-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
- ; GFX9-LABEL: name: test_mul_s16
- ; GFX9: liveins: $vgpr0, $vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
- ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
- ; GFX10-LABEL: name: test_mul_s16
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
- ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16)
- ; GFX10-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
+ ;
+ ; GFX8PLUS-LABEL: name: test_mul_s16
+ ; GFX8PLUS: liveins: $vgpr0, $vgpr1
+ ; GFX8PLUS-NEXT: {{ $}}
+ ; GFX8PLUS-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX8PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX8PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX8PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX8PLUS-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
+ ; GFX8PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16)
+ ; GFX8PLUS-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -378,6 +286,7 @@ body: |
; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; GFX6-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; GFX6-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ;
; GFX8-LABEL: name: test_mul_v2s16
; GFX8: liveins: $vgpr0, $vgpr1
; GFX8-NEXT: {{ $}}
@@ -400,20 +309,14 @@ body: |
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
- ; GFX9-LABEL: name: test_mul_v2s16
- ; GFX9: liveins: $vgpr0, $vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]]
- ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>)
- ; GFX10-LABEL: name: test_mul_v2s16
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]]
- ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>)
+ ;
+ ; GFX9PLUS-LABEL: name: test_mul_v2s16
+ ; GFX9PLUS: liveins: $vgpr0, $vgpr1
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]]
+ ; GFX9PLUS-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_MUL %0, %1
@@ -441,6 +344,7 @@ body: |
; GFX6-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[COPY2]], [[COPY5]]
; GFX6-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[MUL2]](s32)
; GFX6-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16)
+ ;
; GFX8-LABEL: name: test_mul_v3s16
; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX8-NEXT: {{ $}}
@@ -460,66 +364,37 @@ body: |
; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[TRUNC1]], [[TRUNC4]]
; GFX8-NEXT: [[MUL2:%[0-9]+]]:_(s16) = G_MUL [[TRUNC2]], [[TRUNC5]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[MUL]](s16), implicit [[MUL1]](s16), implicit [[MUL2]](s16)
- ; GFX9-LABEL: name: test_mul_v3s16
- ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
- ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
- ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
- ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
- ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16)
- ; GFX10-LABEL: name: test_mul_v3s16
- ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
- ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
- ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16)
+ ;
+ ; GFX9PLUS-LABEL: name: test_mul_v3s16
+ ; GFX9PLUS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9PLUS-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9PLUS-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
+ ; GFX9PLUS-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
+ ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>)
+ ; GFX9PLUS-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9PLUS-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>)
+ ; GFX9PLUS-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9PLUS-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -578,6 +453,7 @@ body: |
; GFX6-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; GFX6-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX8-LABEL: name: test_mul_v4s16
; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX8-NEXT: {{ $}}
@@ -618,28 +494,18 @@ body: |
; GFX8-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
- ; GFX9-LABEL: name: test_mul_v4s16
- ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]]
- ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV1]], [[UV3]]
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>)
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
- ; GFX10-LABEL: name: test_mul_v4s16
- ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV1]], [[UV3]]
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>)
- ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX9PLUS-LABEL: name: test_mul_v4s16
+ ; GFX9PLUS: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]]
+ ; GFX9PLUS-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV1]], [[UV3]]
+ ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>)
+ ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_MUL %0, %1
@@ -652,34 +518,13 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; GFX6-LABEL: name: test_mul_s24
- ; GFX6: liveins: $vgpr0, $vgpr1
- ; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX6-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX8-LABEL: name: test_mul_s24
- ; GFX8: liveins: $vgpr0, $vgpr1
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX8-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX9-LABEL: name: test_mul_s24
- ; GFX9: liveins: $vgpr0, $vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](s32)
- ; GFX10-LABEL: name: test_mul_s24
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
- ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](s32)
+ ; GCN-LABEL: name: test_mul_s24
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
+ ; GCN-NEXT: $vgpr0 = COPY [[MUL]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s24) = G_TRUNC %0
@@ -709,54 +554,48 @@ body: |
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX8-LABEL: name: test_mul_s33
- ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
- ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX9-LABEL: name: test_mul_s33
- ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; GFX10-LABEL: name: test_mul_s33
- ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
- ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
- ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32)
- ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX89-LABEL: name: test_mul_s33
+ ; GFX89: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX89-NEXT: {{ $}}
+ ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
+ ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
+ ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32)
+ ; GFX89-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX1011-LABEL: name: test_mul_s33
+ ; GFX1011: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1011-NEXT: {{ $}}
+ ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]]
+ ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+ ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]]
+ ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
+ ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]]
+ ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32)
+ ; GFX1011-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ;
+ ; GFX12-LABEL: name: test_mul_s33
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s33) = G_TRUNC %0
@@ -800,67 +639,71 @@ body: |
; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96)
- ; GFX8-LABEL: name: test_mul_s96
- ; GFX8: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
- ; GFX8-NEXT: {{ $}}
- ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
- ; GFX8-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
- ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]]
- ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
- ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[UV8]](s32)
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
- ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]]
- ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
- ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32)
- ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
- ; GFX9-LABEL: name: test_mul_s96
- ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
- ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]]
- ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
- ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[UV8]](s32)
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
- ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]]
- ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
- ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
- ; GFX10-LABEL: name: test_mul_s96
- ; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
- ; GFX10-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
- ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]]
- ; GFX10-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]]
- ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32)
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
- ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]]
- ; GFX10-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
- ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32)
- ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
+ ;
+ ; GFX89-LABEL: name: test_mul_s96
+ ; GFX89: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX89-NEXT: {{ $}}
+ ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
+ ; GFX89-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
+ ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
+ ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]]
+ ; GFX89-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64)
+ ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[UV8]](s32)
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
+ ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]]
+ ; GFX89-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64)
+ ; GFX89-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32)
+ ; GFX89-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
+ ;
+ ; GFX1011-LABEL: name: test_mul_s96
+ ; GFX1011: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1011-NEXT: {{ $}}
+ ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
+ ; GFX1011-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
+ ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
+ ; GFX1011-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
+ ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
+ ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]]
+ ; GFX1011-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
+ ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]]
+ ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32)
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
+ ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX1011-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
+ ; GFX1011-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32)
+ ; GFX1011-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
+ ;
+ ; GFX12-LABEL: name: test_mul_s96
+ ; GFX12: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96)
+ ; GFX12-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]]
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64)
+ ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]]
+ ; GFX12-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]]
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]]
+ ; GFX12-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]]
+ ; GFX12-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]]
+ ; GFX12-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32)
+ ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]]
+ ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]]
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64)
+ ; GFX12-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96)
%0:_(s96) = COPY $vgpr0_vgpr1_vgpr2
%1:_(s96) = COPY $vgpr3_vgpr4_vgpr5
%2:_(s96) = G_MUL %0, %1
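
A minimal illustrative sketch (not taken from the commit; the function name is hypothetical) of what the new s_mul_i64 checks below exercise: with llc -global-isel -march=amdgcn -mcpu=gfx1200, a 64-bit multiply of two inreg (SGPR) operands now lowers to a single s_mul_u64, where the GFX10/GFX11 paths emit an s_mul_i32/s_mul_hi_u32/s_add_i32 sequence.

; Expected GFX12 output per the checks in the mul.ll diff below:
;   s_mul_u64 s[0:1], s[0:1], s[2:3]
define amdgpu_ps i64 @mul64_sketch(i64 inreg %a, i64 inreg %b) {
  %r = mul i64 %a, %b
  ret i64 %r
}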
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index eb3f74be71de01..0840f58ecd1a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -4,6 +4,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
@@ -31,6 +32,14 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_i32 s0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -61,6 +70,12 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -95,6 +110,15 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i16_zeroext:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_i32 s0, s0, s1
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -125,6 +149,14 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i16_zeroext:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -159,6 +191,15 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i16_signext:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_i32 s0, s0, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -193,6 +234,14 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i16_signext:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -207,6 +256,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_i32 s0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
@@ -223,6 +277,12 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
@@ -239,6 +299,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_i32 s0, s0, s2
+; GFX12-NEXT: s_mul_i32 s1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -257,6 +323,13 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -308,6 +381,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i33:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
ret i33 %result
}
@@ -359,6 +437,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
@@ -394,6 +477,17 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_add3_u32 v1, v4, v3, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
@@ -490,6 +584,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0
; GFX10PLUS-NEXT: s_mov_b32 s0, s5
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i96:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_i32 s6, s0, s5
+; GFX12-NEXT: s_mul_i32 s7, s1, s4
+; GFX12-NEXT: s_mul_i32 s2, s2, s3
+; GFX12-NEXT: s_add_co_i32 s6, s6, s7
+; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
+; GFX12-NEXT: s_add_co_i32 s6, s6, s2
+; GFX12-NEXT: s_mul_i32 s2, s0, s4
+; GFX12-NEXT: s_mul_i32 s5, s0, s3
+; GFX12-NEXT: s_mul_hi_u32 s0, s0, s4
+; GFX12-NEXT: s_add_co_u32 s2, s2, s7
+; GFX12-NEXT: s_mul_i32 s4, s1, s3
+; GFX12-NEXT: s_add_co_ci_u32 s0, s0, s6
+; GFX12-NEXT: s_mul_hi_u32 s3, s1, s3
+; GFX12-NEXT: s_add_co_u32 s1, s4, s2
+; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0
+; GFX12-NEXT: s_mov_b32 s0, s5
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
@@ -536,6 +650,22 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i96:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX12-NEXT: v_mul_lo_u32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mul_lo_u32 v5, v6, v5
+; GFX12-NEXT: v_mul_lo_u32 v8, v7, v4
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v2, v5, v8, v2
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
@@ -709,6 +839,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX10PLUS-NEXT: s_mov_b32 s1, s8
; GFX10PLUS-NEXT: s_mov_b32 s2, s7
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_i32 s9, s0, s6
+; GFX12-NEXT: s_mul_i32 s11, s1, s5
+; GFX12-NEXT: s_mul_hi_u32 s10, s0, s6
+; GFX12-NEXT: s_mul_hi_u32 s12, s1, s5
+; GFX12-NEXT: s_add_co_u32 s9, s11, s9
+; GFX12-NEXT: s_mul_i32 s11, s2, s4
+; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX12-NEXT: s_mul_hi_u32 s12, s2, s4
+; GFX12-NEXT: s_mul_hi_u32 s8, s0, s4
+; GFX12-NEXT: s_add_co_u32 s9, s11, s9
+; GFX12-NEXT: s_mul_i32 s11, s0, s5
+; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX12-NEXT: s_mul_hi_u32 s12, s0, s5
+; GFX12-NEXT: s_add_co_u32 s8, s11, s8
+; GFX12-NEXT: s_add_co_ci_u32 s9, s12, s9
+; GFX12-NEXT: s_mul_i32 s12, s1, s4
+; GFX12-NEXT: s_mul_hi_u32 s13, s1, s4
+; GFX12-NEXT: s_cselect_b32 s11, 1, 0
+; GFX12-NEXT: s_add_co_u32 s8, s12, s8
+; GFX12-NEXT: s_mul_i32 s12, s0, s7
+; GFX12-NEXT: s_add_co_ci_u32 s7, s13, s9
+; GFX12-NEXT: s_add_co_ci_u32 s9, s10, s12
+; GFX12-NEXT: s_mul_i32 s1, s1, s6
+; GFX12-NEXT: s_cmp_lg_u32 s11, 0
+; GFX12-NEXT: s_mul_i32 s2, s2, s5
+; GFX12-NEXT: s_add_co_ci_u32 s1, s9, s1
+; GFX12-NEXT: s_mul_i32 s3, s3, s4
+; GFX12-NEXT: s_add_co_i32 s1, s1, s2
+; GFX12-NEXT: s_mul_i32 s0, s0, s4
+; GFX12-NEXT: s_add_co_i32 s3, s1, s3
+; GFX12-NEXT: s_mov_b32 s1, s8
+; GFX12-NEXT: s_mov_b32 s2, s7
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
@@ -820,6 +986,32 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
+; GFX12-NEXT: v_mov_b32_e32 v10, v2
+; GFX12-NEXT: v_mul_lo_u32 v3, v3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
+; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7
+; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT: v_mov_b32_e32 v2, v11
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
+; GFX12-NEXT: v_mul_lo_u32 v5, v10, v5
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v3, v4, v5, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
@@ -1625,6 +1817,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7
; GFX10PLUS-NEXT: s_mov_b32 s1, s16
; GFX10PLUS-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_mul_i256:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mul_i32 s17, s0, s10
+; GFX12-NEXT: s_mul_i32 s19, s1, s9
+; GFX12-NEXT: s_mul_hi_u32 s18, s0, s10
+; GFX12-NEXT: s_mul_hi_u32 s20, s1, s9
+; GFX12-NEXT: s_add_co_u32 s17, s19, s17
+; GFX12-NEXT: s_add_co_ci_u32 s18, s20, s18
+; GFX12-NEXT: s_mul_i32 s20, s2, s8
+; GFX12-NEXT: s_mul_hi_u32 s21, s2, s8
+; GFX12-NEXT: s_cselect_b32 s19, 1, 0
+; GFX12-NEXT: s_add_co_u32 s17, s20, s17
+; GFX12-NEXT: s_mul_hi_u32 s16, s0, s8
+; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX12-NEXT: s_mul_i32 s21, s0, s9
+; GFX12-NEXT: s_mul_hi_u32 s22, s0, s9
+; GFX12-NEXT: s_cselect_b32 s20, 1, 0
+; GFX12-NEXT: s_add_co_u32 s16, s21, s16
+; GFX12-NEXT: s_add_co_ci_u32 s17, s22, s17
+; GFX12-NEXT: s_mul_i32 s22, s1, s8
+; GFX12-NEXT: s_mul_hi_u32 s23, s1, s8
+; GFX12-NEXT: s_cselect_b32 s21, 1, 0
+; GFX12-NEXT: s_add_co_u32 s16, s22, s16
+; GFX12-NEXT: s_add_co_ci_u32 s17, s23, s17
+; GFX12-NEXT: s_mul_i32 s23, s0, s12
+; GFX12-NEXT: s_mul_i32 s25, s1, s11
+; GFX12-NEXT: s_mul_hi_u32 s24, s0, s12
+; GFX12-NEXT: s_mul_hi_u32 s26, s1, s11
+; GFX12-NEXT: s_cselect_b32 s22, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s25, s23
+; GFX12-NEXT: s_add_co_ci_u32 s24, s26, s24
+; GFX12-NEXT: s_mul_i32 s26, s2, s10
+; GFX12-NEXT: s_mul_hi_u32 s27, s2, s10
+; GFX12-NEXT: s_cselect_b32 s25, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s26, s23
+; GFX12-NEXT: s_add_co_ci_u32 s24, s27, s24
+; GFX12-NEXT: s_mul_i32 s27, s3, s9
+; GFX12-NEXT: s_mul_hi_u32 s28, s3, s9
+; GFX12-NEXT: s_cselect_b32 s26, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s27, s23
+; GFX12-NEXT: s_add_co_ci_u32 s24, s28, s24
+; GFX12-NEXT: s_mul_i32 s28, s4, s8
+; GFX12-NEXT: s_mul_hi_u32 s29, s4, s8
+; GFX12-NEXT: s_cselect_b32 s27, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s28, s23
+; GFX12-NEXT: s_add_co_ci_u32 s24, s29, s24
+; GFX12-NEXT: s_mul_i32 s29, s0, s11
+; GFX12-NEXT: s_mul_hi_u32 s30, s0, s11
+; GFX12-NEXT: s_cselect_b32 s28, 1, 0
+; GFX12-NEXT: s_add_co_u32 s18, s29, s18
+; GFX12-NEXT: s_add_co_ci_u32 s23, s30, s23
+; GFX12-NEXT: s_mul_i32 s30, s1, s10
+; GFX12-NEXT: s_mul_hi_u32 s31, s1, s10
+; GFX12-NEXT: s_cselect_b32 s29, 1, 0
+; GFX12-NEXT: s_add_co_u32 s18, s30, s18
+; GFX12-NEXT: s_add_co_ci_u32 s23, s31, s23
+; GFX12-NEXT: s_mul_i32 s31, s2, s9
+; GFX12-NEXT: s_mul_hi_u32 s33, s2, s9
+; GFX12-NEXT: s_cselect_b32 s30, 1, 0
+; GFX12-NEXT: s_add_co_u32 s18, s31, s18
+; GFX12-NEXT: s_add_co_ci_u32 s23, s33, s23
+; GFX12-NEXT: s_mul_i32 s33, s3, s8
+; GFX12-NEXT: s_mul_hi_u32 s34, s3, s8
+; GFX12-NEXT: s_cselect_b32 s31, 1, 0
+; GFX12-NEXT: s_add_co_u32 s18, s33, s18
+; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23
+; GFX12-NEXT: s_cselect_b32 s33, 1, 0
+; GFX12-NEXT: s_cmp_lg_u32 s22, 0
+; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14
+; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX12-NEXT: s_cselect_b32 s21, 1, 0
+; GFX12-NEXT: s_cmp_lg_u32 s20, 0
+; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13
+; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0
+; GFX12-NEXT: s_cmp_lg_u32 s21, 0
+; GFX12-NEXT: s_mul_i32 s21, s0, s14
+; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23
+; GFX12-NEXT: s_mul_i32 s23, s1, s13
+; GFX12-NEXT: s_cselect_b32 s20, 1, 0
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s2, s12
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s2, s12
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s3, s11
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s3, s11
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s4, s10
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s4, s10
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s5, s9
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s5, s9
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s6, s8
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s6, s8
+; GFX12-NEXT: s_add_co_u32 s21, s23, s21
+; GFX12-NEXT: s_mul_i32 s23, s0, s13
+; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX12-NEXT: s_mul_hi_u32 s34, s0, s13
+; GFX12-NEXT: s_add_co_u32 s23, s23, s24
+; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21
+; GFX12-NEXT: s_mul_i32 s34, s1, s12
+; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12
+; GFX12-NEXT: s_cselect_b32 s24, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s34, s23
+; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21
+; GFX12-NEXT: s_mul_i32 s35, s2, s11
+; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11
+; GFX12-NEXT: s_cselect_b32 s34, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s35, s23
+; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21
+; GFX12-NEXT: s_mul_i32 s36, s3, s10
+; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10
+; GFX12-NEXT: s_cselect_b32 s35, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s36, s23
+; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21
+; GFX12-NEXT: s_mul_i32 s37, s4, s9
+; GFX12-NEXT: s_mul_hi_u32 s38, s4, s9
+; GFX12-NEXT: s_cselect_b32 s36, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s37, s23
+; GFX12-NEXT: s_add_co_ci_u32 s21, s38, s21
+; GFX12-NEXT: s_mul_i32 s38, s5, s8
+; GFX12-NEXT: s_mul_hi_u32 s39, s5, s8
+; GFX12-NEXT: s_cselect_b32 s37, 1, 0
+; GFX12-NEXT: s_add_co_u32 s23, s38, s23
+; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21
+; GFX12-NEXT: s_cselect_b32 s38, 1, 0
+; GFX12-NEXT: s_cmp_lg_u32 s30, 0
+; GFX12-NEXT: s_mul_i32 s1, s1, s14
+; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX12-NEXT: s_cmp_lg_u32 s31, 0
+; GFX12-NEXT: s_mul_i32 s2, s2, s13
+; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX12-NEXT: s_cmp_lg_u32 s33, 0
+; GFX12-NEXT: s_mul_i32 s3, s3, s12
+; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX12-NEXT: s_cmp_lg_u32 s20, 0
+; GFX12-NEXT: s_mul_i32 s4, s4, s11
+; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23
+; GFX12-NEXT: s_cselect_b32 s23, 1, 0
+; GFX12-NEXT: s_cmp_lg_u32 s26, 0
+; GFX12-NEXT: s_mul_i32 s26, s0, s15
+; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX12-NEXT: s_cmp_lg_u32 s27, 0
+; GFX12-NEXT: s_mul_i32 s5, s5, s10
+; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX12-NEXT: s_cmp_lg_u32 s28, 0
+; GFX12-NEXT: s_mul_i32 s6, s6, s9
+; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX12-NEXT: s_cmp_lg_u32 s23, 0
+; GFX12-NEXT: s_mul_i32 s7, s7, s8
+; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21
+; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26
+; GFX12-NEXT: s_cmp_lg_u32 s38, 0
+; GFX12-NEXT: s_mul_i32 s0, s0, s8
+; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1
+; GFX12-NEXT: s_cmp_lg_u32 s37, 0
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2
+; GFX12-NEXT: s_cmp_lg_u32 s36, 0
+; GFX12-NEXT: s_mov_b32 s2, s17
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GFX12-NEXT: s_cmp_lg_u32 s35, 0
+; GFX12-NEXT: s_mov_b32 s3, s18
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s4
+; GFX12-NEXT: s_cmp_lg_u32 s34, 0
+; GFX12-NEXT: s_mov_b32 s4, s19
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s5
+; GFX12-NEXT: s_cmp_lg_u32 s24, 0
+; GFX12-NEXT: s_mov_b32 s5, s20
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6
+; GFX12-NEXT: s_mov_b32 s6, s15
+; GFX12-NEXT: s_add_co_i32 s7, s1, s7
+; GFX12-NEXT: s_mov_b32 s1, s16
+; GFX12-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
@@ -1978,6 +2349,454 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_mul_i256:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
+; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
+; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8
+; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
+; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
+; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
+; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v20, v22
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v19, v22
+; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
+; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
+; GFX12-NEXT: v_mov_b32_e32 v20, v18
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
+; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
+; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
+; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
+; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13
+; GFX12-NEXT: v_mov_b32_e32 v13, v1
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, v21
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
+; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v7, v8, v7
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}
+
+define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX7-LABEL: s_mul_u64_zext_with_vregs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
+; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_mul_u64_zext_with_vregs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_mul_u64_zext_with_vregs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_mul_u64_zext_with_vregs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_mul_u64_zext_with_vregs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_mul_u64_zext_with_vregs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = zext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+}
+
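(For the divergent case above, GFX12 keeps the multiply on the VALU: the zero-extended 32-bit value feeds a single v_mad_co_u64_u32 with a zero addend, which already yields the full 64-bit product. A minimal IR sketch of the same pattern with two variable operands — hypothetical function name, not part of this patch's tests:

define i64 @zext_mul_sketch(i32 %a, i32 %b) {
  ; both operands have zero upper halves, so a 32x32->64 unsigned multiply is exact
  %xa = zext i32 %a to i64
  %xb = zext i32 %b to i64
  %p = mul i64 %xa, %xb
  ret i64 %p
}
)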
+define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX7-LABEL: s_mul_u64_zext_with_sregs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
+; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_mul_u64_zext_with_sregs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT: s_mulk_i32 s2, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_mul_u64_zext_with_sregs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_mul_u64_zext_with_sregs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
+; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_mul_u64_zext_with_sregs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_mul_u64_zext_with_sregs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = zext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+}
+
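(In the uniform case, GFX12 zeroes the upper half of the register pair with s_mov_b32 s3, 0 and issues one s_mul_u64, while the earlier targets split the product into s_mul_i32 for the low word and s_mul_hi_u32 for the high word. An illustrative IR sketch of the identity that split relies on — every partial product here fits a 32x32->64 operation:

define i64 @mul64_decomposed(i64 %a, i64 %b) {
  %alo = and i64 %a, 4294967295
  %ahi = lshr i64 %a, 32
  %blo = and i64 %b, 4294967295
  %bhi = lshr i64 %b, 32
  %ll = mul i64 %alo, %blo      ; low x low, full 64-bit product
  %lh = mul i64 %alo, %bhi      ; the cross terms only affect the high word
  %hl = mul i64 %ahi, %blo
  %cross = add i64 %lh, %hl
  %hi = shl i64 %cross, 32      ; ahi*bhi would shift past bit 63 and drops out
  %r = add i64 %ll, %hi         ; equals a*b modulo 2^64
  ret i64 %r
}
)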
+define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX7-LABEL: s_mul_u64_sext_with_vregs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_mul_u64_sext_with_vregs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: flat_load_dword v4, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_mul_u64_sext_with_vregs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_mul_u64_sext_with_vregs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: v_mul_lo_u32 v4, 0x50, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_mul_u64_sext_with_vregs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: v_mul_lo_u32 v4, 0x50, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_mul_u64_sext_with_vregs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+}
+
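(For the sign-extended divergent case, GFX12 has a dedicated v_mad_co_i64_i32; the older targets compute the unsigned 32x32->64 product first and then fold the sign word into the high half, which is what the v_ashrrev_i32 plus second v_mad_u64_u32 above implements. The same identity written out as IR — illustrative names, with the constant 80 matching the test:

define i64 @sext_mul_identity(i32 %x) {
  %lo64 = zext i32 %x to i64
  %prod = mul i64 %lo64, 80   ; unsigned full product, as in the first v_mad_u64_u32
  %s = ashr i32 %x, 31        ; 0 or -1, the sign word of sext(x)
  %sc = mul i32 %s, 80
  %sc64 = zext i32 %sc to i64
  %fix = shl i64 %sc64, 32    ; sign correction lands in the high 32 bits
  %r = add i64 %prod, %fix    ; equals sext(x) * 80 modulo 2^64
  ret i64 %r
}
)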
+define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GFX7-LABEL: s_mul_u64_sext_with_sregs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
+; GFX7-NEXT: s_ashr_i32 s5, s3, 31
+; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
+; GFX7-NEXT: s_mulk_i32 s5, 0x50
+; GFX7-NEXT: v_readfirstlane_b32 s3, v0
+; GFX7-NEXT: s_add_u32 s5, s5, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_mul_u64_sext_with_sregs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT: s_ashr_i32 s3, s2, 31
+; GFX8-NEXT: s_mulk_i32 s2, 0x50
+; GFX8-NEXT: s_mulk_i32 s3, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_add_u32 s3, s3, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_mul_u64_sext_with_sregs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_ashr_i32 s4, s3, 31
+; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
+; GFX9-NEXT: s_mulk_i32 s4, 0x50
+; GFX9-NEXT: s_add_u32 s3, s4, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_mul_u64_sext_with_sregs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_ashr_i32 s3, s2, 31
+; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
+; GFX10-NEXT: s_mulk_i32 s3, 0x50
+; GFX10-NEXT: s_mulk_i32 s2, 0x50
+; GFX10-NEXT: s_add_i32 s3, s4, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_mul_u64_sext_with_sregs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_ashr_i32 s3, s2, 31
+; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
+; GFX11-NEXT: s_mulk_i32 s3, 0x50
+; GFX11-NEXT: s_mulk_i32 s2, 0x50
+; GFX11-NEXT: s_add_i32 s3, s4, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_mul_u64_sext_with_sregs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+}
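(The uniform sign-extended case takes the same shape on GFX12: s_ashr_i32 materializes the sign word into the top half of the pair and a single s_mul_u64 does the rest. A hypothetical variant with two variable operands that one would expect to take the same path when both inputs are uniform:

define i64 @smul_sext_pair(i32 %a, i32 %b) {
  %xa = sext i32 %a to i64
  %xb = sext i32 %b to i64
  %p = mul i64 %xa, %xb
  ret i64 %p
}
)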
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir
new file mode 100644
index 00000000000000..f74a575ac931ea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: mul_s64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: mul_s64
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12345
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[C]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 12345
+ %2:_(s64) = G_MUL %0, %1
+ $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: mul_s64_zext
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: mul_s64_zext
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12345
+ ; CHECK-NEXT: [[AMDGPU_:%[0-9]+]]:_(s64) = G_AMDGPU_S_MUL_U64_U32 [[ZEXT]], [[C]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_]](s64)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s64) = G_ZEXT %0
+ %2:_(s64) = G_CONSTANT i64 12345
+ %3:_(s64) = G_MUL %1, %2
+ $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: mul_s64_sext
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: mul_s64_sext
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12345
+ ; CHECK-NEXT: [[AMDGPU_:%[0-9]+]]:_(s64) = G_AMDGPU_S_MUL_I64_I32 [[SEXT]], [[C]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_]](s64)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s64) = G_SEXT %0
+ %2:_(s64) = G_CONSTANT i64 12345
+ %3:_(s64) = G_MUL %1, %2
+ $vgpr0_vgpr1 = COPY %3
+...
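(The new MIR test exercises the combine directly: a plain 64-bit G_MUL is left untouched, while a G_MUL whose operands are a 32-bit extension and a constant fitting in 32 bits is rewritten to G_AMDGPU_S_MUL_U64_U32 for zext or G_AMDGPU_S_MUL_I64_I32 for sext. To reproduce locally, an invocation along the lines of the RUN line above should work:

llc -march=amdgcn -mcpu=gfx1200 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir
)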
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir
index a5b61641e0c267..a6cc6c92d9f8a5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir
@@ -74,3 +74,125 @@ body: |
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_MUL %0, %1
...
+
+---
+name: mul_s64_ss
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: mul_s64_ss
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:sgpr(s64) = G_MUL [[COPY]], [[COPY1]]
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(s64) = G_MUL %0, %1
+...
+
+---
+name: mul_s64_vv
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-LABEL: name: mul_s64_vv
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:vgpr(s32) = G_UMULH [[UV]], [[UV2]]
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[UV]], [[UV3]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UMULH]], [[MUL]]
+ ; CHECK-NEXT: [[MUL1:%[0-9]+]]:vgpr(s32) = G_MUL [[UV1]], [[UV2]]
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[MUL1]]
+ ; CHECK-NEXT: [[MUL2:%[0-9]+]]:vgpr(s32) = G_MUL [[UV]], [[UV2]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[MUL2]](s32), [[ADD1]](s32)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_MUL %0, %1
+...
+
+---
+name: mul_s64_zext_ss
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: mul_s64_zext_ss
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_MUL_U64_:%[0-9]+]]:sgpr_64(s64) = S_MUL_U64 [[COPY]](s64), [[COPY1]](s64)
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(s64) = G_AMDGPU_S_MUL_U64_U32 %0, %1
+...
+
+---
+name: mul_s64_zext_vv
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-LABEL: name: mul_s64_zext_vv
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vreg_64(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:vgpr(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:vreg_64 = G_AMDGPU_MAD_U64_U32 [[TRUNC]](s32), [[TRUNC1]], [[C]]
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_AMDGPU_S_MUL_U64_U32 %0, %1
+...
+
+---
+name: mul_s64_sext_ss
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: mul_s64_sext_ss
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_MUL_U64_:%[0-9]+]]:sgpr_64(s64) = S_MUL_U64 [[COPY]](s64), [[COPY1]](s64)
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(s64) = G_AMDGPU_S_MUL_I64_I32 %0, %1
+...
+
+---
+name: mul_s64_sext_vv
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-LABEL: name: mul_s64_sext_vv
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vreg_64(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[AMDGPU_MAD_I64_I32_:%[0-9]+]]:vgpr(s64), [[AMDGPU_MAD_I64_I32_1:%[0-9]+]]:vreg_64 = G_AMDGPU_MAD_I64_I32 [[TRUNC]](s32), [[TRUNC1]], [[C]]
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_AMDGPU_S_MUL_I64_I32 %0, %1
+...
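(Register-bank selection then resolves the pseudos: with uniform operands they map straight to S_MUL_U64 on the SALU, and with divergent operands they are rewritten to G_AMDGPU_MAD_U64_U32 or G_AMDGPU_MAD_I64_I32 with a zero addend, truncating each source to 32 bits first. The truncation loses nothing here, since the pseudos are only formed from operands whose upper halves are extension bits of the low words, as the combiner tests above show.)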
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 26d981ad7b4bfe..b4c8da44337ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1259,20 +1259,21 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_mov_b32 s9, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -1296,19 +1297,20 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-LABEL: add_i64_constant:
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
@@ -1316,7 +1318,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_waitcnt vmcnt(0)
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB3_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
@@ -1643,23 +1645,21 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB4_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1264-NEXT: s_mul_i32 s9, s1, s8
-; GFX1264-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1264-NEXT: s_mul_i32 s8, s0, s8
-; GFX1264-NEXT: s_add_co_i32 s10, s10, s9
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
-; GFX1264-NEXT: v_mov_b32_e32 v1, s10
+; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
@@ -1687,31 +1687,28 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s3, exec_lo
; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s3, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1232-NEXT: s_mul_i32 s8, s1, s3
-; GFX1232-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1232-NEXT: s_mul_i32 s3, s0, s3
-; GFX1232-NEXT: s_add_co_i32 s9, s9, s8
-; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1232-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: s_mov_b32 s8, s6
-; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mov_b32 s14, -1
+; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: s_mov_b32 s12, s6
+; GFX1232-NEXT: s_mov_b32 s13, s7
+; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_waitcnt vmcnt(0)
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
@@ -3182,20 +3179,21 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_mov_b32 s9, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -3222,19 +3220,20 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-LABEL: sub_i64_constant:
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
@@ -3242,7 +3241,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_waitcnt vmcnt(0)
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB9_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -3585,23 +3584,21 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB10_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1264-NEXT: s_mul_i32 s9, s1, s8
-; GFX1264-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1264-NEXT: s_mul_i32 s8, s0, s8
-; GFX1264-NEXT: s_add_co_i32 s10, s10, s9
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
-; GFX1264-NEXT: v_mov_b32_e32 v1, s10
+; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
@@ -3632,31 +3629,28 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s3, exec_lo
; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s3, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1232-NEXT: s_mul_i32 s8, s1, s3
-; GFX1232-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1232-NEXT: s_mul_i32 s3, s0, s3
-; GFX1232-NEXT: s_add_co_i32 s9, s9, s8
-; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1232-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: s_mov_b32 s8, s6
-; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mov_b32 s14, -1
+; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: s_mov_b32 s12, s6
+; GFX1232-NEXT: s_mov_b32 s13, s7
+; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_waitcnt vmcnt(0)
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB10_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 5e90c33f3c8cb7..e2617fc453b58f 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -4,6 +4,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
; mul24 and mad24 are affected
@@ -106,6 +107,27 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: test_mul_v2i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: test_mul_v2i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -247,6 +269,31 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_v4i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7
+; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6
+; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_v4i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -351,6 +398,21 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_trunc_i64_mul_to_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mul_i32 s0, s0, s6
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -483,6 +545,31 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_trunc_i64_mul_to_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s2, s10
+; GFX12-NEXT: s_mov_b32 s3, s11
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null
+; GFX12-NEXT: s_mov_b32 s8, s4
+; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -587,6 +674,21 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: mul64_sext_c:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -606,6 +708,113 @@ entry:
ret void
}
+define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: mul64_zext_c:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: v_mov_b32_e32 v0, 0x50
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_hi_u32 v1, s4, v0
+; SI-NEXT: s_mulk_i32 s4, 0x50
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: mul64_zext_c:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: mul64_zext_c:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50
+; GFX9-NEXT: s_mulk_i32 s2, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: mul64_zext_c:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: mul64_zext_c:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: mul64_zext_c:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; EG-LABEL: mul64_zext_c:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MULHI * T0.Y, KC0[2].Z, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+entry:
+ %0 = zext i32 %in to i64
+ %1 = mul i64 %0, 80
+ store i64 %1, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul64_sext_c:
; SI: ; %bb.0: ; %entry
@@ -706,6 +915,27 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul64_sext_c:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
+; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -732,6 +962,153 @@ entry:
ret void
}
+define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: v_mul64_zext_c:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT: s_movk_i32 s2, 0x50
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_hi_u32 v1, v0, s2
+; SI-NEXT: v_mul_lo_u32 v0, v0, s2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_mul64_zext_c:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s2, 0x50
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_mul64_zext_c:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX9-NEXT: s_movk_i32 s2, 0x50
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_mul64_zext_c:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s8, s2
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0
+; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_mul64_zext_c:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s2
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
+; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_mul64_zext_c:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
+; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; EG-LABEL: v_mul64_zext_c:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: MULHI * T0.Y, T0.X, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+entry:
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = zext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+}
+
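(Note the contrast with the GlobalISel output earlier in this patch: for the same divergent zext-by-constant pattern, the SelectionDAG path here emits a v_mul_lo_u32 / v_mul_hi_u32 pair on GFX10 through GFX12 rather than a single v_mad with a zero addend; both compute the identical full 64-bit product.)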
define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul64_sext_inline_imm:
; SI: ; %bb.0: ; %entry
@@ -829,6 +1206,27 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul64_sext_inline_imm:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
+; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_inline_imm:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -925,6 +1323,22 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_mul_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mul_i32 s2, s2, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1034,6 +1448,26 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1133,6 +1567,23 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_mul_i1:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
@@ -1272,6 +1723,30 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_i1:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null
+; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -1418,6 +1893,21 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_mul_i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
@@ -1579,6 +2069,37 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, s10
+; GFX12-NEXT: s_mov_b32 s3, s11
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
+; GFX12-NEXT: s_mov_b32 s8, s4
+; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -1616,30 +2137,30 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
-; SI-NEXT: s_cbranch_scc0 .LBB13_2
+; SI-NEXT: s_cbranch_scc0 .LBB15_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mul_i32 s6, s2, s3
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_branch .LBB13_3
-; SI-NEXT: .LBB13_2:
+; SI-NEXT: s_branch .LBB15_3
+; SI-NEXT: .LBB15_2:
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: .LBB13_3: ; %Flow
+; SI-NEXT: .LBB15_3: ; %Flow
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB13_5
+; SI-NEXT: s_cbranch_vccnz .LBB15_5
; SI-NEXT: ; %bb.4: ; %if
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NEXT: s_branch .LBB13_6
-; SI-NEXT: .LBB13_5:
+; SI-NEXT: s_branch .LBB15_6
+; SI-NEXT: .LBB15_5:
; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: .LBB13_6: ; %endif
+; SI-NEXT: .LBB15_6: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1651,18 +2172,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cbranch_scc0 .LBB13_2
+; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mul_i32 s6, s2, s3
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: s_branch .LBB13_3
-; VI-NEXT: .LBB13_2:
+; VI-NEXT: s_branch .LBB15_3
+; VI-NEXT: .LBB15_2:
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: .LBB13_3: ; %Flow
+; VI-NEXT: .LBB15_3: ; %Flow
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: s_cbranch_vccnz .LBB13_5
+; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %if
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
@@ -1670,10 +2191,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: s_branch .LBB13_6
-; VI-NEXT: .LBB13_5:
+; VI-NEXT: s_branch .LBB15_6
+; VI-NEXT: .LBB15_5:
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: .LBB13_6: ; %endif
+; VI-NEXT: .LBB15_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@@ -1686,18 +2207,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_mul_i32 s6, s2, s3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: s_branch .LBB13_3
-; GFX9-NEXT: .LBB13_2:
+; GFX9-NEXT: s_branch .LBB15_3
+; GFX9-NEXT: .LBB15_2:
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $sgpr6
-; GFX9-NEXT: .LBB13_3: ; %Flow
+; GFX9-NEXT: .LBB15_3: ; %Flow
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
; GFX9-NEXT: ; %bb.4: ; %if
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1705,10 +2226,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_branch .LBB13_6
-; GFX9-NEXT: .LBB13_5:
+; GFX9-NEXT: s_branch .LBB15_6
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: .LBB13_6: ; %endif
+; GFX9-NEXT: .LBB15_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1722,17 +2243,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_mul_i32 s5, s2, s3
-; GFX10-NEXT: s_branch .LBB13_3
-; GFX10-NEXT: .LBB13_2:
+; GFX10-NEXT: s_branch .LBB15_3
+; GFX10-NEXT: .LBB15_2:
; GFX10-NEXT: s_mov_b32 s4, -1
; GFX10-NEXT: ; implicit-def: $sgpr5
-; GFX10-NEXT: .LBB13_3: ; %Flow
+; GFX10-NEXT: .LBB15_3: ; %Flow
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
; GFX10-NEXT: ; %bb.4: ; %if
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
@@ -1740,10 +2261,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX10-NEXT: s_branch .LBB13_6
-; GFX10-NEXT: .LBB13_5:
+; GFX10-NEXT: s_branch .LBB15_6
+; GFX10-NEXT: .LBB15_5:
; GFX10-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-NEXT: .LBB13_6: ; %endif
+; GFX10-NEXT: .LBB15_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
@@ -1757,17 +2278,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_mul_i32 s5, s2, s3
-; GFX11-NEXT: s_branch .LBB13_3
-; GFX11-NEXT: .LBB13_2:
+; GFX11-NEXT: s_branch .LBB15_3
+; GFX11-NEXT: .LBB15_2:
; GFX11-NEXT: s_mov_b32 s4, -1
; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: .LBB13_3: ; %Flow
+; GFX11-NEXT: .LBB15_3: ; %Flow
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
; GFX11-NEXT: ; %bb.4: ; %if
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -1775,10 +2296,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_branch .LBB13_6
-; GFX11-NEXT: .LBB13_5:
+; GFX11-NEXT: s_branch .LBB15_6
+; GFX11-NEXT: .LBB15_5:
; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: .LBB13_6: ; %endif
+; GFX11-NEXT: .LBB15_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -1788,6 +2309,43 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: mul32_in_branch:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
+; GFX12-NEXT: ; %bb.1: ; %else
+; GFX12-NEXT: s_mul_i32 s5, s2, s3
+; GFX12-NEXT: s_branch .LBB15_3
+; GFX12-NEXT: .LBB15_2:
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: ; implicit-def: $sgpr5
+; GFX12-NEXT: .LBB15_3: ; %Flow
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX12-NEXT: ; %bb.4: ; %if
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_branch .LBB15_6
+; GFX12-NEXT: .LBB15_5:
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: .LBB15_6: ; %endif
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: mul32_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
@@ -1850,7 +2408,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
-; SI-NEXT: s_cbranch_vccz .LBB14_4
+; SI-NEXT: s_cbranch_vccz .LBB16_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mul_hi_u32 v0, s4, v0
@@ -1861,22 +2419,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz .LBB14_3
-; SI-NEXT: .LBB14_2: ; %if
+; SI-NEXT: s_cbranch_vccnz .LBB16_3
+; SI-NEXT: .LBB16_2: ; %if
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT: .LBB14_3: ; %endif
+; SI-NEXT: .LBB16_3: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
-; SI-NEXT: .LBB14_4:
+; SI-NEXT: .LBB16_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SI-NEXT: s_branch .LBB14_2
+; SI-NEXT: s_branch .LBB16_2
;
; VI-LABEL: mul64_in_branch:
; VI: ; %bb.0: ; %entry
@@ -1884,7 +2442,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
-; VI-NEXT: s_cbranch_scc0 .LBB14_4
+; VI-NEXT: s_cbranch_scc0 .LBB16_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
@@ -1893,22 +2451,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mul_i32 s4, s5, s6
; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; VI-NEXT: s_cbranch_vccnz .LBB14_3
-; VI-NEXT: .LBB14_2: ; %if
+; VI-NEXT: s_cbranch_vccnz .LBB16_3
+; VI-NEXT: .LBB16_2: ; %if
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT: .LBB14_3: ; %endif
+; VI-NEXT: .LBB16_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
-; VI-NEXT: .LBB14_4:
+; VI-NEXT: .LBB16_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT: s_branch .LBB14_2
+; VI-NEXT: s_branch .LBB16_2
;
; GFX9-LABEL: mul64_in_branch:
; GFX9: ; %bb.0: ; %entry
@@ -1916,7 +2474,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_mul_i32 s7, s4, s7
; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
@@ -1925,21 +2483,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_add_i32 s5, s7, s5
; GFX9-NEXT: s_mul_i32 s4, s4, s6
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GFX9-NEXT: s_cbranch_vccnz .LBB14_4
-; GFX9-NEXT: .LBB14_2: ; %if
+; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
+; GFX9-NEXT: .LBB16_2: ; %if
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX9-NEXT: s_branch .LBB14_5
-; GFX9-NEXT: .LBB14_3:
+; GFX9-NEXT: s_branch .LBB16_5
+; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-NEXT: s_branch .LBB14_2
-; GFX9-NEXT: .LBB14_4:
+; GFX9-NEXT: s_branch .LBB16_2
+; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: .LBB14_5: ; %endif
+; GFX9-NEXT: .LBB16_5: ; %endif
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1951,7 +2509,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_mul_i32 s7, s4, s7
; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
@@ -1960,22 +2518,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mul_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s5, s7, s5
; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: s_cbranch_execnz .LBB14_4
-; GFX10-NEXT: .LBB14_2: ; %if
+; GFX10-NEXT: s_cbranch_execnz .LBB16_4
+; GFX10-NEXT: .LBB16_2: ; %if
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_branch .LBB14_5
-; GFX10-NEXT: .LBB14_3:
+; GFX10-NEXT: s_branch .LBB16_5
+; GFX10-NEXT: .LBB16_3:
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX10-NEXT: s_branch .LBB14_2
-; GFX10-NEXT: .LBB14_4:
+; GFX10-NEXT: s_branch .LBB16_2
+; GFX10-NEXT: .LBB16_4:
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: .LBB14_5: ; %endif
+; GFX10-NEXT: .LBB16_5: ; %endif
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1987,7 +2545,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_mul_i32 s7, s4, s7
; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
@@ -1996,21 +2554,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mul_i32 s4, s4, s6
; GFX11-NEXT: s_add_i32 s5, s7, s5
; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_4
-; GFX11-NEXT: .LBB14_2: ; %if
+; GFX11-NEXT: s_cbranch_execnz .LBB16_4
+; GFX11-NEXT: .LBB16_2: ; %if
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_branch .LBB14_5
-; GFX11-NEXT: .LBB14_3:
+; GFX11-NEXT: s_branch .LBB16_5
+; GFX11-NEXT: .LBB16_3:
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX11-NEXT: s_branch .LBB14_2
-; GFX11-NEXT: .LBB14_4:
+; GFX11-NEXT: s_branch .LBB16_2
+; GFX11-NEXT: .LBB16_4:
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: .LBB14_5: ; %endif
+; GFX11-NEXT: .LBB16_5: ; %endif
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2019,6 +2577,38 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: mul64_in_branch:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
+; GFX12-NEXT: ; %bb.1: ; %else
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_cbranch_execnz .LBB16_4
+; GFX12-NEXT: .LBB16_2: ; %if
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_branch .LBB16_5
+; GFX12-NEXT: .LBB16_3:
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: s_branch .LBB16_2
+; GFX12-NEXT: .LBB16_4:
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: .LBB16_5: ; %endif
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: mul64_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
@@ -2324,6 +2914,51 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_mul_i128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s17, s3
+; GFX12-NEXT: s_mov_b32 s19, s3
+; GFX12-NEXT: s_mov_b32 s24, s3
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s2, s4
+; GFX12-NEXT: s_mov_b32 s14, s8
+; GFX12-NEXT: s_mov_b32 s12, s9
+; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
+; GFX12-NEXT: s_mov_b32 s2, s23
+; GFX12-NEXT: s_mov_b32 s16, s5
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
+; GFX12-NEXT: s_mov_b32 s2, s11
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
+; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
+; GFX12-NEXT: s_mov_b32 s18, s7
+; GFX12-NEXT: s_mov_b32 s23, s3
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX12-NEXT: s_mov_b32 s25, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
+; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
@@ -2570,6 +3205,44 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_i128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[0:3], v15, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v15, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
+; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2
+; GFX12-NEXT: v_mul_lo_u32 v3, v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
+; GFX12-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v4, v2, 0
+; GFX12-NEXT: v_mul_lo_u32 v4, v6, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v10
+; GFX12-NEXT: v_mul_lo_u32 v10, v7, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add3_u32 v12, v12, v3, v14
+; GFX12-NEXT: v_add_co_u32 v2, s0, v13, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v6, v0, v[11:12]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v1, v5, v[2:3]
+; GFX12-NEXT: v_add3_u32 v0, v10, v14, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
+; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
+; GFX12-NEXT: global_store_b128 v15, v[8:11], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -2672,6 +3345,12 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: mul_pow2_plus_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END