[llvm] [AMDGPU] Add CodeGen support for GFX12 s_mul_u64 (PR #75825)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 4 04:18:21 PST 2024
================
@@ -2094,6 +2094,75 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
return true;
}
+// Break s_mul_u64 into 32-bit vector operations.
+void AMDGPURegisterBankInfo::applyMappingSMULU64(
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src0Regs.empty() && Src1Regs.empty());
+ applyDefaultMapping(OpdMapper);
+ return;
+ }
+
+ assert(DefRegs.size() == 2);
+ assert(Src0Regs.size() == Src1Regs.size() &&
+ (Src0Regs.empty() || Src0Regs.size() == 2));
+
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ MachineInstr &MI = OpdMapper.getMI();
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT HalfTy = LLT::scalar(32);
+
+ // Depending on where the source registers came from, the generic code may
+ // have decided to split the inputs already or not. If not, we still need to
+ // extract the values.
+
+ if (Src0Regs.empty())
+ split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
+ else
+ setRegsToType(MRI, Src0Regs, HalfTy);
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else
+ setRegsToType(MRI, Src1Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ // The multiplication is done as follows:
+ //
+ //                            Op1H       Op1L
+ //                          * Op0H       Op0L
+ //                       --------------------
+ //                       Op1H*Op0L  Op1L*Op0L
+ //          + Op1H*Op0H  Op1L*Op0H
+ // ------------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because it only contributes to bits 64 and above,
+ // which do not fit in the 64-bit result.
+ // The low 32-bit value is the low half of Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry, where carry is
+ // the high 32 bits of Op1L*Op0L.
+
+ Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
+ MRI.setRegBank(Hi, AMDGPU::VGPRRegBank);
----------------
arsenm wrote:
It's a bit ugly, but ApplyRegBankMapping is used in some other places to avoid explicitly inserting a setRegBank call at every point.
https://github.com/llvm/llvm-project/pull/75825
More information about the llvm-commits mailing list