[llvm] AMDGPU: Move reg_sequence splat handling (PR #140313)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 14:45:00 PDT 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/140313
This code clunkily tried to find a splat reg_sequence by looking at every
use of the reg_sequence, and then looking back at the reg_sequence to see
whether it is a splat. Extract this into a separate helper function to clean
it up. The splat check is now done once when the reg_sequence is parsed, and
the legal inline immediate check is deferred to the per-use check (which is
really use-context dependent).

The one regression is in GlobalISel, which has an extra copy that should
have been folded out separately. It was previously handled by the
foldable-copy logic in tryToFoldACImm.

This is preparation for #139317.
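As an illustration only (not part of the patch), here is a minimal standalone
sketch of the new split, using simplified stand-in types rather than the real
MachineInstr/MachineOperand API: the splat value is detected once from the
reg_sequence inputs, and the inline-immediate legality test is left to a
separate, per-use step.

#include <cstdint>
#include <optional>
#include <vector>

// Simplified stand-in for a reg_sequence input: an immediate or something
// else (register, subregister, ...).
struct SeqInput {
  bool IsImm;
  int64_t Imm; // Only meaningful when IsImm is true.
};

// Parse the reg_sequence inputs once. If every input is the same immediate,
// return it; otherwise return nothing. This mirrors what isRegSeqSplat does,
// except the real helper also returns the common register class of the
// inputs and does not yet ask whether the value is a legal inline immediate.
static std::optional<int64_t> findSplatImm(const std::vector<SeqInput> &Inputs) {
  std::optional<int64_t> Splat;
  for (const SeqInput &In : Inputs) {
    if (!In.IsImm)
      return std::nullopt;
    if (!Splat)
      Splat = In.Imm;
    else if (*Splat != In.Imm)
      return std::nullopt; // Can only fold splat constants.
  }
  return Splat;
}

int main() {
  std::vector<SeqInput> SplatInputs = {{true, 0x3f800000}, {true, 0x3f800000}};
  std::vector<SeqInput> MixedInputs = {{true, 1}, {true, 2}};
  // The legality check (tryFoldRegSeqSplat in the patch) would run later,
  // once per use, because it depends on the use operand's type.
  return (findSplatImm(SplatInputs) && !findSplatImm(MixedInputs)) ? 0 : 1;
}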
From f2dc1401f2e3f4588ee5061ed2f93da29996666c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 16 May 2025 18:53:33 +0200
Subject: [PATCH] AMDGPU: Move reg_sequence splat handling
This code clunkily tried to find a splat reg_sequence by looking at every
use of the reg_sequence, and then looking back at the reg_sequence to see
whether it is a splat. Extract this into a separate helper function to clean
it up. The splat check is now done once when the reg_sequence is parsed, and
the legal inline immediate check is deferred to the per-use check (which is
really use-context dependent).

The one regression is in GlobalISel, which has an extra copy that should
have been folded out separately. It was previously handled by the
foldable-copy logic in tryToFoldACImm.

This is preparation for #139317.
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 155 +++++++++++++++-------
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 4 +-
2 files changed, 112 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 633a858df364e..92937e33fd500 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -119,9 +119,22 @@ class SIFoldOperandsImpl {
MachineOperand *OpToFold) const;
bool isUseSafeToFold(const MachineInstr &MI,
const MachineOperand &UseMO) const;
- bool
+
+ const TargetRegisterClass *getRegSeqInit(
+ MachineInstr &RegSeq,
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
+
+ const TargetRegisterClass *
getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
- Register UseReg, uint8_t OpTy) const;
+ Register UseReg) const;
+
+ std::pair<MachineOperand *, const TargetRegisterClass *>
+ isRegSeqSplat(MachineInstr &RegSeg) const;
+
+ MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
+ MachineOperand *SplatVal,
+ const TargetRegisterClass *SplatRC) const;
+
bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const;
@@ -825,19 +838,24 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
return Sub;
}
-// Find a def of the UseReg, check if it is a reg_sequence and find initializers
-// for each subreg, tracking it to foldable inline immediate if possible.
-// Returns true on success.
-bool SIFoldOperandsImpl::getRegSeqInit(
- SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
- Register UseReg, uint8_t OpTy) const {
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (!Def || !Def->isRegSequence())
- return false;
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+ MachineInstr &RegSeq,
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
- for (unsigned I = 1, E = Def->getNumExplicitOperands(); I != E; I += 2) {
- MachineOperand &SrcOp = Def->getOperand(I);
- unsigned SubRegIdx = Def->getOperand(I + 1).getImm();
+ assert(RegSeq.isRegSequence());
+
+ const TargetRegisterClass *RC = nullptr;
+
+ for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
+ MachineOperand &SrcOp = RegSeq.getOperand(I);
+ unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
+
+ // Only accept reg_sequence with uniform reg class inputs for simplicity.
+ const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
+ if (!RC)
+ RC = OpRC;
+ else if (!TRI->getCommonSubClass(RC, OpRC))
+ return nullptr;
if (SrcOp.getSubReg()) {
// TODO: Handle subregister compose
@@ -846,8 +864,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
}
MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
- if (DefSrc && (DefSrc->isReg() ||
- (DefSrc->isImm() && TII->isInlineConstant(*DefSrc, OpTy)))) {
+ if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
Defs.emplace_back(DefSrc, SubRegIdx);
continue;
}
@@ -855,7 +872,65 @@ bool SIFoldOperandsImpl::getRegSeqInit(
Defs.emplace_back(&SrcOp, SubRegIdx);
}
- return true;
+ return RC;
+}
+
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking it to an immediate if possible. Returns the
+// register class of the inputs on success.
+const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
+ SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
+ Register UseReg) const {
+ MachineInstr *Def = MRI->getVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return nullptr;
+
+ return getRegSeqInit(*Def, Defs);
+}
+
+std::pair<MachineOperand *, const TargetRegisterClass *>
+SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
+ SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+ const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
+ if (!SrcRC)
+ return {};
+
+ int64_t Imm;
+ for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+ const MachineOperand *Op = Defs[I].first;
+ if (!Op->isImm())
+ return {};
+
+ int64_t SubImm = Op->getImm();
+ if (!I) {
+ Imm = SubImm;
+ continue;
+ }
+ if (Imm != SubImm)
+ return {}; // Can only fold splat constants
+ }
+
+ return {Defs[0].first, SrcRC};
+}
+
+MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
+ MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal,
+ const TargetRegisterClass *SplatRC) const {
+ const MCInstrDesc &Desc = UseMI->getDesc();
+ if (UseOpIdx >= Desc.getNumOperands())
+ return nullptr;
+
+ // Filter out unhandled pseudos.
+ if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
+ return nullptr;
+
+ // FIXME: Verify SplatRC is compatible with the use operand
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+ if (!TII->isInlineConstant(*SplatVal, OpTy) ||
+ !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+ return nullptr;
+
+ return SplatVal;
}
bool SIFoldOperandsImpl::tryToFoldACImm(
@@ -869,7 +944,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm()) {
if (unsigned UseSubReg = UseOp.getSubReg()) {
@@ -916,31 +990,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
}
}
- SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (!getRegSeqInit(Defs, UseReg, OpTy))
- return false;
-
- int32_t Imm;
- for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
- const MachineOperand *Op = Defs[I].first;
- if (!Op->isImm())
- return false;
-
- auto SubImm = Op->getImm();
- if (!I) {
- Imm = SubImm;
- if (!TII->isInlineConstant(*Op, OpTy) ||
- !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
- return false;
-
- continue;
- }
- if (Imm != SubImm)
- return false; // Can only fold splat constants
- }
-
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
- return true;
+ return false;
}
void SIFoldOperandsImpl::foldOperand(
@@ -970,14 +1020,26 @@ void SIFoldOperandsImpl::foldOperand(
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineOperand *SplatVal;
+ const TargetRegisterClass *SplatRC;
+ std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
+
// Grab the use operands first
SmallVector<MachineOperand *, 4> UsesToProcess(
llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
for (auto *RSUse : UsesToProcess) {
MachineInstr *RSUseMI = RSUse->getParent();
+ unsigned OpNo = RSUseMI->getOperandNo(RSUse);
- if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
- RSUseMI->getOperandNo(RSUse), FoldList))
+ if (SplatVal) {
+ if (MachineOperand *Foldable =
+ tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
+ appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
+ continue;
+ }
+ }
+
+ if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
continue;
if (RSUse->getSubReg() != RegSeqDstSubReg)
@@ -986,6 +1048,7 @@ void SIFoldOperandsImpl::foldOperand(
foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
CopiesToReplace);
}
+
return;
}
@@ -2137,7 +2200,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
return false;
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
+ if (!getRegSeqInit(Defs, Reg))
return false;
for (auto &[Op, SubIdx] : Defs) {
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index ecd1abce67571..ddc3e770767b8 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1680,10 +1680,12 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()