[llvm] r280584 - AMDGPU: Fix spilling of m0
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 2 23:57:56 PDT 2016
Author: arsenm
Date: Sat Sep 3 01:57:55 2016
New Revision: 280584
URL: http://llvm.org/viewvc/llvm-project?rev=280584&view=rev
Log:
AMDGPU: Fix spilling of m0
readlane/writelane do not support using m0 as the output/input.
Constrain the register class of spill vregs to try to avoid this,
but also handle spilling of the physreg when necessary by inserting
an additional copy to a normal SGPR.
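As a rough illustration (register numbers are hypothetical; the
allocator picks the actual copy register, and the test below happens to
get vcc_hi), the sequence emitted for a spill and reload of the
physical register m0 should look like:

  s_mov_b32 s0, m0           ; copy m0 into a regular SGPR
  v_writelane_b32 v0, s0, 0  ; spill the copy into a VGPR lane
  ...
  v_readlane_b32 s0, v0, 0   ; reload into a regular SGPR (not m0)
  s_mov_b32 m0, s0           ; restore m0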
Added:
llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
Removed:
llvm/trunk/test/CodeGen/AMDGPU/m0-spill.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=280584&r1=280583&r2=280584&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Sat Sep 3 01:57:55 2016
@@ -587,17 +587,18 @@ void SIInstrInfo::storeRegToStackSlot(Ma
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use a pseudo instruction for spilling SGPRs.
+ const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+
+    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
+    // need to make sure we are using the correct register class.
if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
- // m0 may not be allowed for readlane.
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- // We are only allowed to create one new instruction when spilling
- // registers, so we need to use pseudo instruction for spilling
- // SGPRs.
- unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
- BuildMI(MBB, MI, DL, get(Opcode))
+ BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // src
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
@@ -621,10 +622,10 @@ void SIInstrInfo::storeRegToStackSlot(Ma
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // src
- .addFrameIndex(FrameIndex) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(0) // offset
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
@@ -685,15 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(M
if (RI.isSGPRClass(RC)) {
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
- unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
-
+ const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
- // m0 may not be allowed for readlane.
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=280584&r1=280583&r2=280584&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Sat Sep 3 01:57:55 2016
@@ -1352,10 +1352,11 @@ multiclass SI_SPILL_SGPR <RegisterClass
} // End UseNamedOperandTable = 1
}
-// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use SReg_32_XM0 register class for spills to prevent
-// this from happening.
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>;
+// You cannot use M0 as the output of v_readlane_b32 instructions or
+// use it in the sdata operand of SMEM instructions. We still need to
+// be able to spill the physical register m0, so allow it for
+// SI_SPILL_S32_* instructions.
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=280584&r1=280583&r2=280584&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Sat Sep 3 01:57:55 2016
@@ -540,9 +540,9 @@ void SIRegisterInfo::eliminateFrameIndex
case AMDGPU::SI_SPILL_S32_SAVE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
unsigned SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -551,8 +551,19 @@ void SIRegisterInfo::eliminateFrameIndex
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
-
if (Spill.hasReg()) {
+ if (SuperReg == AMDGPU::M0) {
+ assert(NumSubRegs == 1);
+ unsigned CopyM0
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
+ .addReg(SuperReg, getKillRegState(IsKill));
+
+ // The real spill now kills the temp copy.
+ SubReg = SuperReg = CopyM0;
+ IsKill = true;
+ }
+
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -611,6 +622,14 @@ void SIRegisterInfo::eliminateFrameIndex
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned SuperReg = MI->getOperand(0).getReg();
+    // m0 is not allowed with readlane/writelane, so a temporary SGPR and an
+    // extra copy are needed.
+ bool IsM0 = (SuperReg == AMDGPU::M0);
+ if (IsM0) {
+ assert(NumSubRegs == 1);
+ SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ }
+
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
@@ -651,6 +670,11 @@ void SIRegisterInfo::eliminateFrameIndex
}
}
+ if (IsM0 && SuperReg != AMDGPU::M0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(SuperReg);
+ }
+
MI->eraseFromParent();
break;
}
Removed: llvm/trunk/test/CodeGen/AMDGPU/m0-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/m0-spill.ll?rev=280583&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/m0-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/m0-spill.ll (removed)
@@ -1,35 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
- at lds = external addrspace(3) global [64 x float]
-
-; CHECK-LABEL: {{^}}main:
-; CHECK-NOT: v_readlane_b32 m0
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
-main_body:
- %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
- %cmp = fcmp ueq float 0.0, %4
- br i1 %cmp, label %if, label %else
-
-if:
- %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
- %lds_data = load float, float addrspace(3)* %lds_ptr
- br label %endif
-
-else:
- %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
- br label %endif
-
-endif:
- %export = phi float [%lds_data, %if], [%interp, %else]
- %5 = call i32 @llvm.SI.packf16(float %export, float %export)
- %6 = bitcast i32 %5 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
- ret void
-}
-
-declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
Added: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=280584&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Sat Sep 3 01:57:55 2016
@@ -0,0 +1,78 @@
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+
+; XXX - Why does it like to use vcc?
+
+; GCN-LABEL: {{^}}spill_m0:
+; GCN: s_cmp_lg_i32
+
+; TOVGPR: s_mov_b32 vcc_hi, m0
+; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], vcc_hi, 0
+
+; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
+; TOVMEM: s_waitcnt vmcnt(0)
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: [[ENDIF]]:
+; TOVGPR: v_readlane_b32 vcc_hi, [[SPILL_VREG]], 0
+; TOVGPR: s_mov_b32 m0, vcc_hi
+
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
+; TOVMEM: s_waitcnt vmcnt(0)
+; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
+; TOVMEM: s_mov_b32 m0, vcc_hi
+
+; GCN: s_add_i32 m0, m0, 1
+define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+entry:
+ %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %endif
+
+if:
+ call void asm sideeffect "v_nop", ""() #0
+ br label %endif
+
+endif:
+ %foo = call i32 asm sideeffect "s_add_i32 $0, $1, 1", "=s,{M0}"(i32 %m0) #0
+ store i32 %foo, i32 addrspace(1)* %out
+ ret void
+}
+
+ at lds = internal addrspace(3) global [64 x float] undef
+
+; GCN-LABEL: {{^}}spill_m0_lds:
+; GCN-NOT: v_readlane_b32 m0
+define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
+main_body:
+ %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+ %cmp = fcmp ueq float 0.0, %4
+ br i1 %cmp, label %if, label %else
+
+if:
+ %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+ %lds_data = load float, float addrspace(3)* %lds_ptr
+ br label %endif
+
+else:
+ %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+ br label %endif
+
+endif:
+ %export = phi float [%lds_data, %if], [%interp, %else]
+ %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+ %6 = bitcast i32 %5 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+ ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }