[llvm] 06f3079 - AMDGPU: More consistently use the fold list instead of direct mutation (#127612)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 21:02:06 PST 2025
Author: Matt Arsenault
Date: 2025-02-25T12:02:03+07:00
New Revision: 06f30792353a5d7bacc9110294db7cca49b4eafa
URL: https://github.com/llvm/llvm-project/commit/06f30792353a5d7bacc9110294db7cca49b4eafa
DIFF: https://github.com/llvm/llvm-project/commit/06f30792353a5d7bacc9110294db7cca49b4eafa.diff
LOG: AMDGPU: More consistently use the fold list instead of direct mutation (#127612)
There were two parallel fold-check mechanisms, so consistently use the
fold list. The worklist management here is still not good: other types
of folds are not using it, and we should probably rewrite the pass to
look more like peephole-opt.

This should be an alternative fix to skipping the commute if the operands
are the same (#127562). The new test is not broken as-is, but it
demonstrates failures with a future patch.
Added:
llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
Modified:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 641365d319a50..dbed042776e2c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -114,7 +114,7 @@ class SIFoldOperandsImpl {
bool
getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
Register UseReg, uint8_t OpTy) const;
- bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
+ bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const;
void foldOperand(MachineOperand &OpToFold,
@@ -575,11 +575,6 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
return true;
}
-static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
- const MachineInstr *MI) {
- return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
-}
-
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *FoldOp, bool Commuted = false,
@@ -680,12 +675,6 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
}
}
- // If we are already folding into another operand of MI, then
- // we can't commute the instruction, otherwise we risk making the
- // other fold illegal.
- if (isUseMIInFoldList(FoldList, MI))
- return false;
-
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -816,7 +805,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
}
bool SIFoldOperandsImpl::tryToFoldACImm(
- const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
+ MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const {
const MCInstrDesc &Desc = UseMI->getDesc();
if (UseOpIdx >= Desc.getNumOperands())
@@ -827,7 +816,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
- UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
return true;
}
@@ -838,16 +827,13 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!UseReg.isVirtual())
return false;
- if (isUseMIInFoldList(FoldList, UseMI))
- return false;
-
// Maybe it is just a COPY of an immediate itself.
MachineInstr *Def = MRI->getVRegDef(UseReg);
MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
MachineOperand &DefOp = Def->getOperand(1);
if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
return true;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
new file mode 100644
index 0000000000000..18fbf3eaf9c8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=peephole-opt,si-fold-operands -o - %s | FileCheck %s
+
+# Check for assert when trying later folds after commuting an
+# instruction where both operands are the same register. This depended
+# on use list ordering, so we need to run peephole-opt first to
+# reproduce.
+
+---
+name: commute_add_same_inputs_assert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: commute_add_same_inputs_assert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
+ ; CHECK-NEXT: $vgpr4 = COPY [[V_ADDC_U32_e32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr4
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_64 = S_MOV_B64 0
+ %2:sreg_32 = COPY %1.sub0
+ %3:vgpr_32 = V_ADD_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
+ %4:vgpr_32 = COPY %2
+ %5:vgpr_32 = COPY %2
+ %6:vgpr_32 = V_ADDC_U32_e32 %4, %5, implicit-def $vcc, implicit $vcc, implicit $exec
+ $vgpr4 = COPY %6
+ SI_RETURN implicit $vgpr4
+
+...
+
+---
+name: commute_sub_same_inputs_assert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: commute_sub_same_inputs_assert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBBREV_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBBREV_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
+ ; CHECK-NEXT: $vgpr4 = COPY [[V_SUBBREV_U32_e32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr4
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_64 = S_MOV_B64 0
+ %2:sreg_32 = COPY %1.sub0
+ %3:vgpr_32 = V_SUB_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
+ %4:vgpr_32 = COPY %2
+ %5:vgpr_32 = COPY %2
+ %6:vgpr_32 = V_SUBB_U32_e32 %5, %4, implicit-def $vcc, implicit $vcc, implicit $exec
+ $vgpr4 = COPY %6
+ SI_RETURN implicit $vgpr4
+
+...
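For reference, the new MIR test can also be run outside of lit by expanding
its RUN line by hand; as the comment in the test notes, peephole-opt has to
run before si-fold-operands because the original failure depended on use-list
ordering. The path below is the one from the Added list; adjust it to your
checkout:

  llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji \
    -run-pass=peephole-opt,si-fold-operands -o - \
    llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir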