[llvm] [AMDGPU] Fold multiple aligned v_mov_b32 to v_mov_b64 on gfx942 (PR #138843)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 24 03:25:50 PDT 2025
================
@@ -2194,6 +2195,100 @@ bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
return true;
}
+// gfx942+ can use V_MOV_B64 for materializing constant immediates.
+// For example:
+// %0:vgpr_32 = V_MOV_B32 0, implicit $exec
+// %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
+// ->
+// %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
+ assert(MI.isRegSequence());
+ auto Reg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+ const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
+ const TargetRegisterClass *RC =
+ TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());
+
+ if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
+ !MRI->hasOneNonDBGUse(Reg) ||
+ (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
+ DefRC != RC))
+ return false;
+
+ SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, Reg))
+ return false;
+
+ // Only attempting to fold immediate materializations.
+ if (!Defs.empty() &&
+ std::any_of(Defs.begin(), Defs.end(),
+ [](const std::pair<MachineOperand *, unsigned> &Op) {
+ return !Op.first->isImm();
+ }))
+ return false;
+
+ SmallVector<uint64_t, 8> ImmVals;
+ uint64_t ImmVal = 0;
+ uint64_t ImmSize = 0;
+ for (auto &[Op, SubIdx] : Defs) {
+ unsigned SubRegSize = TRI->getSubRegIdxSize(SubIdx);
+ unsigned Shift = (TRI->getChannelFromSubReg(SubIdx) % 2) * SubRegSize;
+ ImmSize += SubRegSize;
+ ImmVal |= Op->getImm() << Shift;
+
+ if (ImmSize > 64 || SubRegSize == 64)
+ return false;
+
+ if (ImmSize == 64) {
+ // Only 32 bit literals can be encoded.
+ if (!isUInt<32>(ImmVal))
+ return false;
+ ImmVals.push_back(ImmVal);
+ ImmVal = 0;
+ ImmSize = 0;
+ }
+ }
+
+ // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
+ if (DefRC == RC) {
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
+ .addImm(ImmVals[0]);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (ImmVals.size() == 1)
+ return false;
+
+ // Can't bail from here on out: modifying the MI.
+
+ // Remove source operands.
+ for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
+ MI.removeOperand(i);
+
+ unsigned Ch = 0;
+ for (uint64_t Val : ImmVals) {
+ Register MovReg = MRI->createVirtualRegister(RC);
+ // Duplicate vmov imm materializations (e.g., splatted operands) should get
+ // combined by MachineCSE pass.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
+ .addImm(Val);
+
+ // 2 subregs with no overlap (e.g., sub0_sub1, sub2_sub3, etc.).
+ unsigned SubReg64B =
+ SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2);
+
+ MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
+ MI.addOperand(MachineOperand::CreateImm(SubReg64B));
+ ++Ch;
+ }
----------------
ritter-x2a wrote:
Would this try to fold a 3*32-bit reg sequence into two 64-bit movs (which, I'd guess, would break in some way because of mismatched register types)? Or are unevenly-sized reg sequences impossible here?
https://github.com/llvm/llvm-project/pull/138843
More information about the llvm-commits
mailing list