[llvm] AMDGPU: Fold more scalar operations on frame index to VALU (PR #115059)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 09:13:02 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/115059

>From 222beefb3d21cf61b2c801a88be1375e2360f525 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 1 Nov 2024 12:24:37 -0700
Subject: [PATCH] AMDGPU: Fold more scalar operations on frame index to VALU

Further extend workaround for the lack of proper regbankselect
for frame indexes.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 121 ++++++++-----
 .../fold-operands-s-add-copy-to-vgpr.mir      | 167 ++++++++++++++++--
 2 files changed, 229 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 096d0316e9dc08..73834773f66e3c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -78,9 +78,25 @@ class SIFoldOperandsImpl {
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                          const MachineOperand &OpToFold) const;
 
-  /// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
-  ///
-  ///   => %vgpr = V_ADD_U32 x, frameindex
+  // Scalar -> VALU opcode (UseVOP3 picks _e64). TODO: Just use TII::getVALUOp
+  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
+    switch (Opc) {
+    case AMDGPU::S_ADD_I32: {
+      if (ST->hasAddNoCarry()) // Otherwise fall back to the V_ADD_CO_* forms.
+        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
+      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+    }
+    case AMDGPU::S_OR_B32:
+      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
+    case AMDGPU::S_AND_B32:
+      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
+    case AMDGPU::S_MUL_I32:
+      return AMDGPU::V_MUL_LO_U32_e64; // Always _e64; UseVOP3 is not consulted.
+    default:
+      return AMDGPU::INSTRUCTION_LIST_END; // Sentinel: no mapping for Opc.
+    }
+  }
+
   bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                              MachineInstr &MI) const;
 
@@ -202,6 +218,8 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
   const unsigned Opc = UseMI.getOpcode();
   switch (Opc) {
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_AND_B32:
   case AMDGPU::V_ADD_U32_e32:
   case AMDGPU::V_ADD_CO_U32_e32:
     // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
@@ -238,53 +256,62 @@ bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
   if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
       MRI->hasOneNonDBGUse(SrcReg)) {
     MachineInstr *Def = MRI->getVRegDef(SrcReg);
-    if (Def && Def->getOpcode() == AMDGPU::S_ADD_I32 &&
-        Def->getOperand(3).isDead()) {
-      MachineOperand *Src0 = &Def->getOperand(1);
-      MachineOperand *Src1 = &Def->getOperand(2);
-
-      // TODO: This is profitable with more operand types, and for more
-      // opcodes. But ultimately this is working around poor / nonexistent
-      // regbankselect.
-      if (!Src0->isFI() && !Src1->isFI())
-        return false;
+    if (!Def || Def->getNumOperands() != 4)
+      return false;
 
-      if (Src0->isFI())
-        std::swap(Src0, Src1);
-
-      MachineBasicBlock *MBB = Def->getParent();
-      const DebugLoc &DL = Def->getDebugLoc();
-      if (ST->hasAddNoCarry()) {
-        bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
-        MachineInstrBuilder Add =
-            BuildMI(*MBB, *Def, DL,
-                    TII->get(UseVOP3 ? AMDGPU::V_ADD_U32_e64
-                                     : AMDGPU::V_ADD_U32_e32),
-                    DstReg)
-                .add(*Src0)
-                .add(*Src1)
-                .setMIFlags(Def->getFlags());
-        if (UseVOP3)
-          Add.addImm(0);
-
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
-      }
+    MachineOperand *Src0 = &Def->getOperand(1);
+    MachineOperand *Src1 = &Def->getOperand(2);
 
-      MachineBasicBlock::LivenessQueryResult Liveness =
-          MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
-      if (Liveness == MachineBasicBlock::LQR_Dead) {
-        // TODO: If src1 satisfies operand constraints, use vop3 version.
-        BuildMI(*MBB, *Def, DL, TII->get(AMDGPU::V_ADD_CO_U32_e32), DstReg)
-            .add(*Src0)
-            .add(*Src1)
-            .setOperandDead(3) // implicit-def $vcc
-            .setMIFlags(Def->getFlags());
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
+    // TODO: This is profitable with more operand types, and for more
+    // opcodes. But ultimately this is working around poor / nonexistent
+    // regbankselect.
+    if (!Src0->isFI() && !Src1->isFI())
+      return false;
+
+    if (Src0->isFI())
+      std::swap(Src0, Src1);
+
+    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
+    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
+    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
+        !Def->getOperand(3).isDead()) // Check if scc is dead
+      return false;
+
+    MachineBasicBlock *MBB = Def->getParent();
+    const DebugLoc &DL = Def->getDebugLoc();
+    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
+      MachineInstrBuilder Add =
+          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
+
+      if (Add->getDesc().getNumDefs() == 2) {
+        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
+        Add.addDef(CarryOutReg, RegState::Dead);
+        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
       }
+
+      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
+      if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
+        Add.addImm(0);
+
+      Def->eraseFromParent();
+      MI.eraseFromParent();
+      return true;
+    }
+
+    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
+
+    MachineBasicBlock::LivenessQueryResult Liveness =
+        MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
+    if (Liveness == MachineBasicBlock::LQR_Dead) {
+      // TODO: If src1 satisfies operand constraints, use vop3 version.
+      BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
+          .add(*Src0)
+          .add(*Src1)
+          .setOperandDead(3) // implicit-def $vcc
+          .setMIFlags(Def->getFlags());
+      Def->eraseFromParent();
+      MI.eraseFromParent();
+      return true;
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index 683f02b413315e..8c88c7a97174e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -75,8 +75,8 @@ stack:
 body:             |
   bb.0:
     ; GFX8-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -98,8 +98,8 @@ stack:
 body:             |
   bb.0:
     ; GFX8-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -202,8 +202,8 @@ body:             |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__mov_fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -239,8 +239,8 @@ body:             |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_copy_mov_fi_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -337,8 +337,8 @@ body:             |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -371,8 +371,8 @@ body:             |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_fi_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -392,3 +392,146 @@ body:             |
     %2:vgpr_32 = COPY %1
     SI_RETURN implicit %2
 ...
+
+---
+name:  fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name:  fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name:  fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name:  fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name:  fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e64_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name:  fold_s_and_b32__fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 128, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name:  fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = S_MUL_I32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name:  fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+      liveins: $sgpr4
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+    ; CHECK: liveins: $sgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = COPY $sgpr4
+    %1:sreg_32 = S_MUL_I32 %stack.0, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name:  fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...



More information about the llvm-commits mailing list