[llvm] AMDGPU: Fold more scalar operations on frame index to VALU (PR #115059)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 09:13:02 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/115059
From 222beefb3d21cf61b2c801a88be1375e2360f525 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 1 Nov 2024 12:24:37 -0700
Subject: [PATCH] AMDGPU: Fold more scalar operations on frame index to VALU
Further extend workaround for the lack of proper regbankselect
for frame indexes.
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 121 ++++++++-----
.../fold-operands-s-add-copy-to-vgpr.mir | 167 ++++++++++++++++--
2 files changed, 229 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 096d0316e9dc08..73834773f66e3c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -78,9 +78,25 @@ class SIFoldOperandsImpl {
bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
const MachineOperand &OpToFold) const;
- /// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
- ///
- /// => %vgpr = V_ADD_U32 x, frameindex
+ // TODO: Just use TII::getVALUOp
+ unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
+ switch (Opc) {
+ case AMDGPU::S_ADD_I32: {
+ if (ST->hasAddNoCarry())
+ return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
+ return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+ }
+ case AMDGPU::S_OR_B32:
+ return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
+ case AMDGPU::S_AND_B32:
+ return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
+ case AMDGPU::S_MUL_I32:
+ return AMDGPU::V_MUL_LO_U32_e64;
+ default:
+ return AMDGPU::INSTRUCTION_LIST_END;
+ }
+ }
+
bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
MachineInstr &MI) const;
@@ -202,6 +218,8 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
const unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_AND_B32:
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32:
// TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
@@ -238,53 +256,62 @@ bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
MRI->hasOneNonDBGUse(SrcReg)) {
MachineInstr *Def = MRI->getVRegDef(SrcReg);
- if (Def && Def->getOpcode() == AMDGPU::S_ADD_I32 &&
- Def->getOperand(3).isDead()) {
- MachineOperand *Src0 = &Def->getOperand(1);
- MachineOperand *Src1 = &Def->getOperand(2);
-
- // TODO: This is profitable with more operand types, and for more
- // opcodes. But ultimately this is working around poor / nonexistent
- // regbankselect.
- if (!Src0->isFI() && !Src1->isFI())
- return false;
+ if (!Def || Def->getNumOperands() != 4)
+ return false;
- if (Src0->isFI())
- std::swap(Src0, Src1);
-
- MachineBasicBlock *MBB = Def->getParent();
- const DebugLoc &DL = Def->getDebugLoc();
- if (ST->hasAddNoCarry()) {
- bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
- MachineInstrBuilder Add =
- BuildMI(*MBB, *Def, DL,
- TII->get(UseVOP3 ? AMDGPU::V_ADD_U32_e64
- : AMDGPU::V_ADD_U32_e32),
- DstReg)
- .add(*Src0)
- .add(*Src1)
- .setMIFlags(Def->getFlags());
- if (UseVOP3)
- Add.addImm(0);
-
- Def->eraseFromParent();
- MI.eraseFromParent();
- return true;
- }
+ MachineOperand *Src0 = &Def->getOperand(1);
+ MachineOperand *Src1 = &Def->getOperand(2);
- MachineBasicBlock::LivenessQueryResult Liveness =
- MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
- if (Liveness == MachineBasicBlock::LQR_Dead) {
- // TODO: If src1 satisfies operand constraints, use vop3 version.
- BuildMI(*MBB, *Def, DL, TII->get(AMDGPU::V_ADD_CO_U32_e32), DstReg)
- .add(*Src0)
- .add(*Src1)
- .setOperandDead(3) // implicit-def $vcc
- .setMIFlags(Def->getFlags());
- Def->eraseFromParent();
- MI.eraseFromParent();
- return true;
+ // TODO: This is profitable with more operand types, and for more
+ // opcodes. But ultimately this is working around poor / nonexistent
+ // regbankselect.
+ if (!Src0->isFI() && !Src1->isFI())
+ return false;
+
+ if (Src0->isFI())
+ std::swap(Src0, Src1);
+
+ const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
+ unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
+ if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
+ !Def->getOperand(3).isDead()) // Check if scc is dead
+ return false;
+
+ MachineBasicBlock *MBB = Def->getParent();
+ const DebugLoc &DL = Def->getDebugLoc();
+ if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
+ MachineInstrBuilder Add =
+ BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
+
+ if (Add->getDesc().getNumDefs() == 2) {
+ Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ Add.addDef(CarryOutReg, RegState::Dead);
+ MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
}
+
+ Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
+ if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
+ Add.addImm(0);
+
+ Def->eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
+
+ MachineBasicBlock::LivenessQueryResult Liveness =
+ MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
+ if (Liveness == MachineBasicBlock::LQR_Dead) {
+ // TODO: If src1 satisfies operand constraints, use vop3 version.
+ BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
+ .add(*Src0)
+ .add(*Src1)
+ .setOperandDead(3) // implicit-def $vcc
+ .setMIFlags(Def->getFlags());
+ Def->eraseFromParent();
+ MI.eraseFromParent();
+ return true;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index 683f02b413315e..8c88c7a97174e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -75,8 +75,8 @@ stack:
body: |
bb.0:
; GFX8-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
- ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -98,8 +98,8 @@ stack:
body: |
bb.0:
; GFX8-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
- ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -202,8 +202,8 @@ body: |
; GFX8: liveins: $sgpr8
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
- ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__mov_fi_reg_copy_to_virt_vgpr
; GFX9: liveins: $sgpr8
@@ -239,8 +239,8 @@ body: |
; GFX8: liveins: $sgpr8
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
- ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__reg_copy_mov_fi_to_virt_vgpr
; GFX9: liveins: $sgpr8
@@ -337,8 +337,8 @@ body: |
; GFX8: liveins: $sgpr8
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
- ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__fi_reg_copy_to_virt_vgpr
; GFX9: liveins: $sgpr8
@@ -371,8 +371,8 @@ body: |
; GFX8: liveins: $sgpr8
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
- ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+ ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+ ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
;
; GFX9-LABEL: name: fold_s_add_i32__reg_fi_copy_to_virt_vgpr
; GFX9: liveins: $sgpr8
@@ -392,3 +392,146 @@ body: |
%2:vgpr_32 = COPY %1
SI_RETURN implicit %2
...
+
+---
+name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+ ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
+ %2:vgpr_32 = COPY %1
+ SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+ ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
+ %2:vgpr_32 = COPY %1
+ SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+ ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit %1
+ %0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
+ %1:vgpr_32 = COPY %0
+ SI_RETURN implicit %1
+...
+
+---
+name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+ ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit %1
+ %0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
+ %1:vgpr_32 = COPY %0
+ SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+ ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e64_]]
+ %0:sreg_32 = S_AND_B32 %stack.0, 64, implicit-def dead $scc
+ %1:vgpr_32 = COPY %0
+ SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+ ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+ %0:sreg_32 = S_AND_B32 %stack.0, 128, implicit-def dead $scc
+ %1:vgpr_32 = COPY %0
+ SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+ ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+ %0:sreg_32 = S_MUL_I32 %stack.0, 64, implicit-def dead $scc
+ %1:vgpr_32 = COPY %0
+ SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ liveins: $sgpr4
+ ; CHECK-LABEL: name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+ ; CHECK: liveins: $sgpr4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+ %0:sreg_32 = COPY $sgpr4
+ %1:sreg_32 = S_MUL_I32 %stack.0, %0, implicit-def dead $scc
+ %2:vgpr_32 = COPY %1
+ SI_RETURN implicit %2
+...
+
+---
+name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+ ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
+ %2:vgpr_32 = COPY %1
+ SI_RETURN implicit %2
+...
More information about the llvm-commits
mailing list