[llvm] Co-issue packed instructions by unpacking (PR #151704)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 11:48:13 PDT 2025
================
@@ -225,6 +254,712 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+/// Returns true when \p MI carries one of the packed (V_PK_*) opcodes that
+/// are accepted for unpacking, and false for every other opcode.
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+    MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  // Only the packed F32/F16 add and mul forms and the packed F32 fma form
+  // are accepted.
+  return Opc == AMDGPU::V_PK_ADD_F32 || Opc == AMDGPU::V_PK_MUL_F32 ||
+         Opc == AMDGPU::V_PK_MUL_F16 || Opc == AMDGPU::V_PK_ADD_F16 ||
+         Opc == AMDGPU::V_PK_FMA_F32;
+}
+
+/// Maps the packed opcode of \p I to the opcode of its unpacked counterpart.
+///
+/// The 64-bit (e64, VOP3) encodings are chosen on purpose: VOP3 lets the
+/// VOP3P source modifiers be translated, whereas the e32 forms are VOP2 and
+/// do not allow source modifiers.
+///
+/// \returns the unpacked e64 opcode, or std::numeric_limits<uint16_t>::max()
+/// when the opcode of \p I has no mapping here.
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+  switch (I.getOpcode()) {
+  case AMDGPU::V_PK_ADD_F16:
+    return AMDGPU::V_ADD_F16_e64;
+  case AMDGPU::V_PK_MUL_F16:
+    return AMDGPU::V_MUL_F16_e64;
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    break;
+  }
+  // Sentinel value: no unpacked opcode is mapped for this instruction.
+  return std::numeric_limits<uint16_t>::max();
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto *MF = BB->getParent();
+ int NumInst = 0;
+
+ auto E = BB->end();
+
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ if (Instr.isMetaInstruction())
+ continue;
+
+ if (Instr.isTerminator())
+ return false;
+
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return false;
+
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ // Unpacking packed F16 instructions requires multiple instructions.
+ // Instructions are issued to extract the lower and higher bits of each
+ // operand. Instructions are then issued for the 2 unpacked operations,
+ // and additional instructions put the results back into the original
+ // destination register. The following sequence of instructions is
+ // issued:
+
+ // The next two are needed to move masks into vgprs. Ideally, immediates
+ // should be used. However, if one of the source operands is an
+ // sgpr/sreg, then immediates are not allowed. Hence the need to move
+ // these into vgprs.
+
+ // vgpr_32 = V_MOV_B32_e32 65535
+ // vgpr_32 = V_MOV_B32_e32 16
+
+ // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64
+ // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
+ // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
+
+ // We need to issue the MOV instructions above only once. Once these are
+ // issued, the IsF16MaskSet flag is set, and subsequent unpacking only
+ // needs to issue the remaining instructions. The number of latency cycles
+ // for each instruction above is 1. It is hard-coded here to reduce
+ // code complexity.
+ if (IsF16MaskSet)
+ TotalCyclesBetweenCandidates += 7;
+ else
+ TotalCyclesBetweenCandidates += 9;
+ } else
+ TotalCyclesBetweenCandidates += 1;
+
+ if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
----------------
jrbyrnes wrote:
I think a tie in the cycle count should go to the packed version.
https://github.com/llvm/llvm-project/pull/151704
More information about the llvm-commits mailing list