[llvm] AMDGPU: Fix using illegal VOP3 literal in frame index elimination (PR #115747)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 11 09:51:59 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes:
Most of the change is just moving the default case outside of the switch,
so we can break to the default handling for arbitrary operations.
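As a rough illustration of that restructure (a hypothetical skeleton, not the actual `SIRegisterInfo::eliminateFrameIndex` code): the former `default:` body moves below the switch, so any case that cannot take its specialized path can `break` and fall into the shared handling. All helper names below are made up for the sketch.

```cpp
#include <cstdio>
#include <cstdint>

// Hypothetical stand-ins for the real legality check and the
// specialized/default frame-index handling in the pass.
static bool fastPathLegal(int64_t Imm) { return Imm >= -16 && Imm <= 64; }
static bool handleSpecialized() { std::puts("folded in place"); return true; }
static bool handleDefault() { std::puts("generic handling"); return true; }

// After the refactor, a case that cannot take its fast path simply
// breaks out of the switch, and control falls through to the shared
// handling that now lives after the switch instead of in `default:`.
static bool eliminateFrameIndexSketch(int Opcode, int64_t Imm) {
  switch (Opcode) {
  case 1: // e.g. an add that may fold the frame offset in place
    if (fastPathLegal(Imm))
      return handleSpecialized();
    break; // cannot fold legally: bail out to the shared path below
  default:
    break;
  }
  return handleDefault(); // former `default:` body
}

int main() {
  eliminateFrameIndexSketch(1, 32);   // takes the specialized fold
  eliminateFrameIndexSketch(1, 4096); // breaks to the default handling
}
```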
---
Patch is 77.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115747.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+361-354)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir (+528-10)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir (+58)
- (modified) llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll (+19)
- (modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (+4-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f76d1266f495cf..246ef7ad481ab7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2268,7 +2268,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
@@ -2496,6 +2496,25 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Register MaterializedReg = FrameReg;
Register ScavengedVGPR;
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ // For the non-immediate case, we could fall through to the default
+ // handling, but we do an in-place update of the result register here to
+ // avoid scavenging another register.
+ if (OtherOp->isImm()) {
+ int64_t TotalOffset = OtherOp->getImm() + Offset;
+
+ if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
+ !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
+ // If we can't support a VOP3 literal in the VALU instruction, we
+ // can't specially fold into the add.
+ // TODO: Handle VOP3->VOP2 shrink to support the fold.
+ break;
+ }
+
+ OtherOp->setImm(TotalOffset);
+ Offset = 0;
+ }
+
if (FrameReg && !ST.enableFlatScratch()) {
// We should just do an in-place update of the result register. However,
// the value there may also be used by the add, in which case we need a
@@ -2516,15 +2535,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MaterializedReg = ScavengedVGPR;
}
- int64_t Offset = FrameInfo.getObjectOffset(Index);
- // For the non-immediate case, we could fall through to the default
- // handling, but we do an in-place update of the result register here to
- // avoid scavenging another register.
- if (OtherOp->isImm()) {
- OtherOp->setImm(OtherOp->getImm() + Offset);
- Offset = 0;
- }
-
if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
if (ST.enableFlatScratch() &&
!TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
@@ -2761,411 +2771,408 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
return true;
}
default: {
- // Other access to frame index
- const DebugLoc &DL = MI->getDebugLoc();
+ break;
+ }
+ }
- int64_t Offset = FrameInfo.getObjectOffset(Index);
- if (ST.enableFlatScratch()) {
- if (TII->isFLATScratch(*MI)) {
- assert((int16_t)FIOperandNum ==
- AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::saddr));
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (ST.enableFlatScratch()) {
+ if (TII->isFLATScratch(*MI)) {
+ assert(
+ (int16_t)FIOperandNum ==
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
- // The offset is always swizzled, just replace it
- if (FrameReg)
- FIOp->ChangeToRegister(FrameReg, false);
+ // The offset is always swizzled, just replace it
+ if (FrameReg)
+ FIOp->ChangeToRegister(FrameReg, false);
- MachineOperand *OffsetOp =
+ MachineOperand *OffsetOp =
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
- int64_t NewOffset = Offset + OffsetOp->getImm();
- if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
- SIInstrFlags::FlatScratch)) {
- OffsetOp->setImm(NewOffset);
- if (FrameReg)
- return false;
- Offset = 0;
- }
+ int64_t NewOffset = Offset + OffsetOp->getImm();
+ if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
+ OffsetOp->setImm(NewOffset);
+ if (FrameReg)
+ return false;
+ Offset = 0;
+ }
- if (!Offset) {
- unsigned Opc = MI->getOpcode();
- int NewOpc = -1;
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
- NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
- } else if (ST.hasFlatScratchSTMode()) {
- // On GFX10 we have ST mode to use no registers for an address.
- // Otherwise we need to materialize 0 into an SGPR.
- NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
- }
+ if (!Offset) {
+ unsigned Opc = MI->getOpcode();
+ int NewOpc = -1;
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
+ NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
+ } else if (ST.hasFlatScratchSTMode()) {
+ // On GFX10 we have ST mode to use no registers for an address.
+ // Otherwise we need to materialize 0 into an SGPR.
+ NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
+ }
- if (NewOpc != -1) {
- // removeOperand doesn't fixup tied operand indexes as it goes, so
- // it asserts. Untie vdst_in for now and retie them afterwards.
- int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::vdst_in);
- bool TiedVDst = VDstIn != -1 &&
- MI->getOperand(VDstIn).isReg() &&
- MI->getOperand(VDstIn).isTied();
- if (TiedVDst)
- MI->untieRegOperand(VDstIn);
-
- MI->removeOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
-
- if (TiedVDst) {
- int NewVDst =
- AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
- int NewVDstIn =
- AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
- assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
- MI->tieOperands(NewVDst, NewVDstIn);
- }
- MI->setDesc(TII->get(NewOpc));
- return false;
+ if (NewOpc != -1) {
+ // removeOperand doesn't fixup tied operand indexes as it goes, so
+ // it asserts. Untie vdst_in for now and retie them afterwards.
+ int VDstIn =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+ bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
+ MI->getOperand(VDstIn).isTied();
+ if (TiedVDst)
+ MI->untieRegOperand(VDstIn);
+
+ MI->removeOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
+
+ if (TiedVDst) {
+ int NewVDst =
+ AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
+ int NewVDstIn =
+ AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
+ assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
+ MI->tieOperands(NewVDst, NewVDstIn);
}
+ MI->setDesc(TII->get(NewOpc));
+ return false;
}
}
+ }
- if (!FrameReg) {
- FIOp->ChangeToImmediate(Offset);
- if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
- return false;
- }
+ if (!FrameReg) {
+ FIOp->ChangeToImmediate(Offset);
+ if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
+ return false;
+ }
- // We need to use register here. Check if we can use an SGPR or need
- // a VGPR.
- FIOp->ChangeToRegister(AMDGPU::M0, false);
- bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
+ // We need to use register here. Check if we can use an SGPR or need
+ // a VGPR.
+ FIOp->ChangeToRegister(AMDGPU::M0, false);
+ bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
- if (!Offset && FrameReg && UseSGPR) {
- FIOp->setReg(FrameReg);
- return false;
- }
+ if (!Offset && FrameReg && UseSGPR) {
+ FIOp->setReg(FrameReg);
+ return false;
+ }
- const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
- : &AMDGPU::VGPR_32RegClass;
+ const TargetRegisterClass *RC =
+ UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
- Register TmpReg =
- RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
- FIOp->setReg(TmpReg);
- FIOp->setIsKill();
+ Register TmpReg =
+ RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
+ FIOp->setReg(TmpReg);
+ FIOp->setIsKill();
- if ((!FrameReg || !Offset) && TmpReg) {
- unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
- auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
- if (FrameReg)
- MIB.addReg(FrameReg);
- else
- MIB.addImm(Offset);
+ if ((!FrameReg || !Offset) && TmpReg) {
+ unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+ auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
+ if (FrameReg)
+ MIB.addReg(FrameReg);
+ else
+ MIB.addImm(Offset);
- return false;
- }
+ return false;
+ }
- bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
- !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
+ bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
+ !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
- Register TmpSReg =
- UseSGPR ? TmpReg
- : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
- MI, false, 0, !UseSGPR);
+ Register TmpSReg =
+ UseSGPR ? TmpReg
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+ MI, false, 0, !UseSGPR);
- // TODO: for flat scratch another attempt can be made with a VGPR index
- // if no SGPRs can be scavenged.
- if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
- report_fatal_error("Cannot scavenge register in FI elimination!");
+ // TODO: for flat scratch another attempt can be made with a VGPR index
+ // if no SGPRs can be scavenged.
+ if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
+ report_fatal_error("Cannot scavenge register in FI elimination!");
- if (!TmpSReg) {
- // Use frame register and restore it after.
- TmpSReg = FrameReg;
- FIOp->setReg(FrameReg);
- FIOp->setIsKill(false);
- }
+ if (!TmpSReg) {
+ // Use frame register and restore it after.
+ TmpSReg = FrameReg;
+ FIOp->setReg(FrameReg);
+ FIOp->setIsKill(false);
+ }
- if (NeedSaveSCC) {
- assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
- .addReg(FrameReg)
- .addImm(Offset);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
- .addReg(TmpSReg)
- .addImm(0);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
+ if (NeedSaveSCC) {
+ assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
+ .addReg(TmpSReg)
+ .addImm(0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
+ .addImm(0)
+ .addReg(TmpSReg);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+ }
+
+ if (!UseSGPR)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(TmpSReg, RegState::Kill);
+
+ if (TmpSReg == FrameReg) {
+ // Undo frame register modification.
+ if (NeedSaveSCC &&
+ !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
+ MachineBasicBlock::iterator I =
+ BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
+ TmpSReg)
+ .addReg(FrameReg)
+ .addImm(-Offset);
+ I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
+ .addReg(TmpSReg)
+ .addImm(0);
+ BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
+ TmpSReg)
.addImm(0)
.addReg(TmpSReg);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
+ BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
+ FrameReg)
.addReg(FrameReg)
- .addImm(Offset);
+ .addImm(-Offset);
}
+ }
- if (!UseSGPR)
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addReg(TmpSReg, RegState::Kill);
-
- if (TmpSReg == FrameReg) {
- // Undo frame register modification.
- if (NeedSaveSCC &&
- !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
- MachineBasicBlock::iterator I =
- BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
- TmpSReg)
- .addReg(FrameReg)
- .addImm(-Offset);
- I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
- .addReg(TmpSReg)
- .addImm(0);
- BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
- TmpSReg)
- .addImm(0)
- .addReg(TmpSReg);
- } else {
- BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
- FrameReg)
- .addReg(FrameReg)
- .addImm(-Offset);
- }
- }
+ return false;
+ }
- return false;
- }
+ bool IsMUBUF = TII->isMUBUF(*MI);
+
+ if (!IsMUBUF && !MFI->isBottomOfStack()) {
+ // Convert to a swizzled stack address by scaling by the wave size.
+ // In an entry function/kernel the offset is already swizzled.
+ bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+ bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
+ !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
+ const TargetRegisterClass *RC = IsSALU && !LiveSCC
+ ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::VGPR_32RegClass;
+ bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+ MI->getOpcode() == AMDGPU::S_MOV_B32;
+ Register ResultReg =
+ IsCopy ? MI->getOperand(0).getReg()
+ : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
- bool IsMUBUF = TII->isMUBUF(*MI);
-
- if (!IsMUBUF && !MFI->isBottomOfStack()) {
- // Convert to a swizzled stack address by scaling by the wave size.
- // In an entry function/kernel the offset is already swizzled.
- bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
- bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
- !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
- const TargetRegisterClass *RC = IsSALU && !LiveSCC
- ? &AMDGPU::SReg_32RegClass
- : &AMDGPU::VGPR_32RegClass;
- bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
- MI->getOpcode() == AMDGPU::S_MOV_B32;
- Register ResultReg =
- IsCopy ? MI->getOperand(0).getReg()
- : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
-
- int64_t Offset = FrameInfo.getObjectOffset(Index);
- if (Offset == 0) {
- unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
- : AMDGPU::V_LSHRREV_B32_e64;
- Register TmpResultReg = ResultReg;
- if (IsSALU && LiveSCC) {
- TmpResultReg = RS->scavengeRegisterBackwards(
- AMDGPU::VGPR_32RegClass, MI, false, 0);
- }
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (Offset == 0) {
+ unsigned OpCode =
+ IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
+ Register TmpResultReg = ResultReg;
+ if (IsSALU && LiveSCC) {
+ TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+ MI, false, 0);
+ }
- auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
- if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
- // For V_LSHRREV, the operands are reversed (the shift count goes
- // first).
- Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
- else
- Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
- if (IsSALU && !LiveSCC)
- Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
- if (IsSALU && LiveSCC) {
- Register NewDest =
- IsCopy ? ResultReg
- : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
- Shift, false, 0);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- NewDest)
- .addReg(TmpResultReg);
- ResultReg = NewDest;
- }
- } else {
- MachineInstrBuilder MIB;
- if (!IsSALU) {
- if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
- nullptr) {
- // Reuse ResultReg in intermediate step.
- Register ScaledReg = ResultReg;
-
- BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- ScaledReg)
+ auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
+ if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
+ // For V_LSHRREV, the operands are reversed (the shift count goes
+ // first).
+ Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
+ else
+ Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
+ if (IsSALU && !LiveSCC)
+ Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
+ if (IsSALU && LiveSCC) {
+ Register NewDest =
+ IsCopy ? ResultReg
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+ Shift, false, 0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
+ .addReg(TmpResultReg);
+ ResultReg = NewDest;
+ }
+ } else {
+ MachineInstrBuilder MIB;
+ if (!IsSALU) {
+ if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
+ nullptr) {
+ // Reuse ResultReg in intermediate step.
+ Register ScaledReg = ResultReg;
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ ...
[truncated]
``````````
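For readers skimming the truncated diff, the heart of the fix is the new bail-out near the top of the patch. Below is a minimal standalone sketch of that legality check, assuming the AMDGPU integer inline-constant range of -16..64; `canFoldOffsetIntoAdd` is a hypothetical wrapper for illustration (the real code operates on `MachineInstr`, `GCNSubtarget`, and `AMDGPU::isInlinableIntLiteral`).

```cpp
#include <cassert>
#include <cstdint>

// Mirrors AMDGPU::isInlinableIntLiteral: integer inline constants on
// AMDGPU cover -16..64 and need no literal slot in the encoding.
static bool isInlinableIntLiteral(int64_t Literal) {
  return Literal >= -16 && Literal <= 64;
}

// Sketch of the new bail-out: on subtargets without VOP3 literal
// support, a VOP3 instruction cannot encode an arbitrary 32-bit
// immediate, so the frame offset may only be folded into the add when
// the combined immediate is an inline constant.
static bool canFoldOffsetIntoAdd(bool HasVOP3Literal, bool IsVOP3,
                                 int64_t OtherImm, int64_t FrameOffset) {
  int64_t TotalOffset = OtherImm + FrameOffset;
  if (!HasVOP3Literal && IsVOP3 && !isInlinableIntLiteral(TotalOffset))
    return false; // would need an illegal VOP3 literal
  return true;
}

int main() {
  assert(canFoldOffsetIntoAdd(false, true, 16, 16));    // 32 is inlinable
  assert(!canFoldOffsetIntoAdd(false, true, 64, 4032)); // 4096 is not
  assert(canFoldOffsetIntoAdd(true, true, 64, 4032));   // literal allowed
}
```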
https://github.com/llvm/llvm-project/pull/115747