[llvm] r328831 - AMDGPU: Support realigning stack

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 29 14:30:06 PDT 2018


Author: arsenm
Date: Thu Mar 29 14:30:06 2018
New Revision: 328831

URL: http://llvm.org/viewvc/llvm-project?rev=328831&view=rev
Log:
AMDGPU: Support realigning stack

While the stack access instructions don't care about
alignment > 4, some transformations on the pointer calculation
do make assumptions based on knowing the low bits of a pointer
are 0. If a stack object ends up being accessed through its
absolute address (relative to the kernel scratch wave offset),
the addressing expression may depend on the stack frame being
properly aligned. This was breaking in a testcase due to the
add->or combine.
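
To make the assumption concrete (a minimal standalone sketch, not part of
this change): folding an add of a small offset into a bitwise OR is only
valid when the offset fits entirely in the base address's known-zero low
bits, i.e. when the frame really has the alignment the combine assumed.

  // Hypothetical illustration of the add->or assumption; not code from
  // this patch.
  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Aligned = 0x100;     // frame base with the low 4 bits clear
    uint32_t Misaligned = 0x104;  // frame base that is only 4-byte aligned

    // With a 16-byte aligned base, add and or of an offset < 16 agree.
    assert((Aligned + 12) == (Aligned | 12));

    // With a misaligned base, the or form computes the wrong address.
    assert((Misaligned + 12) != (Misaligned | 12));
    return 0;
  }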

I think some of the SP/FP handling logic is still backwards, and too
simplistic to support all of the stack features. Code that modifies the
SP with inline asm, for example, or that uses variable-sized objects
will probably require redoing this.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=328831&r1=328830&r2=328831&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Thu Mar 29 14:30:06 2018
@@ -587,6 +587,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo
 
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
 
 
   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||

Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=328831&r1=328830&r2=328831&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Thu Mar 29 14:30:06 2018
@@ -13,6 +13,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -487,9 +488,43 @@ void SIFrameLowering::emitEntryFunctionS
   }
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer.  We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
-  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   if (FuncInfo->isEntryFunction()) {
     emitEntryFunctionPrologue(MF, MBB);
     return;
@@ -498,6 +533,7 @@ void SIFrameLowering::emitPrologue(Machi
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -505,8 +541,36 @@ void SIFrameLowering::emitPrologue(Machi
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
+  // XXX - Is this the right predicate?
+
   bool NeedFP = hasFP(MF);
-  if (NeedFP) {
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = NumBytes;
+  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+  if (NeedsRealignment) {
+    assert(NeedFP);
+    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned ZeroLowBits = countTrailingZeros(Alignment);
+    assert(ZeroLowBits > 1);
+
+    RoundedSize += Alignment;
+
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, NumBytes
+    // s_and_b32 s32, tmp_reg, 0b111...0000
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+      .addReg(StackPtrReg)
+      .addImm((Alignment - 1) * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+      .addReg(ScratchSPReg, RegState::Kill)
+      .addImm(-Alignment * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    FuncInfo->setIsStackRealigned(true);
+  } else if (NeedFP) {
     // If we need a base pointer, set it up here. It's whatever the value of
     // the stack pointer is at this point. Any variable size objects will be
     // allocated after this, so we can still use the base pointer to reference
@@ -516,11 +580,10 @@ void SIFrameLowering::emitPrologue(Machi
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  uint32_t NumBytes = MFI.getStackSize();
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (RoundedSize != 0 && hasSP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
+      .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
@@ -566,10 +629,12 @@ void SIFrameLowering::emitEpilogue(Machi
   // it's really whether we need SP to be accurate or not.
 
   if (NumBytes != 0 && hasSP(MF)) {
+    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+      NumBytes + MFI.getMaxAlignment() : NumBytes;
+
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addImm(RoundedSize * ST.getWavefrontSize());
   }
 }
 
@@ -759,7 +824,8 @@ bool SIFrameLowering::hasFP(const Machin
 }
 
 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
   // All stack operations are relative to the frame offset SGPR.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects();
+  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
 }

Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=328831&r1=328830&r2=328831&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Thu Mar 29 14:30:06 2018
@@ -142,6 +142,7 @@ private:
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
   bool HasNonSpillStackObjects = false;
+  bool IsStackRealigned = false;
 
   unsigned NumSpilledSGPRs = 0;
   unsigned NumSpilledVGPRs = 0;
@@ -495,6 +496,14 @@ public:
     HasNonSpillStackObjects = StackObject;
   }
 
+  bool isStackRealigned() const {
+    return IsStackRealigned;
+  }
+
+  void setIsStackRealigned(bool Realigned = true) {
+    IsStackRealigned = Realigned;
+  }
+
   unsigned getNumSpilledSGPRs() const {
     return NumSpilledSGPRs;
   }

Added: llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll?rev=328831&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll Thu Mar 29 14:30:06 2018
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that we properly realign the stack. While 4-byte access is all
+; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g. the add->or combine).
+
+
+; 128 byte object
+; 4 byte emergency stack slot
+; = 144 bytes with padding between them
+
+; GCN-LABEL: {{^}}needs_align16_default_stack_align:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN-NOT: s32
+
+; GCN: ; ScratchSize: 144
+define void @needs_align16_default_stack_align(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align16_stack_align4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffffc00
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x2800
+
+; GCN: ; ScratchSize: 160
+define void @needs_align16_stack_align4(i32 %idx) #2 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align32:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffff800
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x3000
+
+; GCN: ; ScratchSize: 192
+define void @needs_align32(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+  ret void
+}
+
+; GCN-LABEL: {{^}}force_realign4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
+; GCN: s_and_b32 s5, s6, 0xffffff00
+; GCN: s_add_u32 s32, s32, 0xd00{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: s_sub_u32 s32, s32, 0xd00
+
+; GCN: ; ScratchSize: 52
+define void @force_realign4(i32 %idx) #1 {
+  %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align16_from_8:
+; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 2, i32 addrspace(5)* %alloca
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; The call sequence should keep the stack on call aligned to 4
+; GCN-LABEL: {{^}}kernel_call_align16_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8  addrspace(5)* %alloca0
+
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align4_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align4_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8  addrspace(5)* %alloca0
+
+  call void @needs_align16_stack_align4(i32 1)
+  ret void
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { noinline nounwind "stackrealign" }
+attributes #2 = { noinline nounwind alignstack=4 }
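
As a cross-check of the AMDGPUAsmPrinter change (again a sketch, not part
of the patch): when the stack is realigned, the reported private segment
size is padded by the maximum alignment, which is why the align-16 body
reports ScratchSize 144 with the default stack alignment but 160 once it
is forced to realign.

  // Sketch of the reported scratch size when realignment occurs.
  #include <cassert>

  int main() {
    unsigned StackSize = 144;      // FrameInfo.getStackSize() for the align-16 body
    unsigned MaxAlignment = 16;    // FrameInfo.getMaxAlignment()
    bool IsStackRealigned = true;  // set by emitPrologue in this patch

    unsigned PrivateSegmentSize =
        StackSize + (IsStackRealigned ? MaxAlignment : 0);
    assert(PrivateSegmentSize == 160); // matches "; ScratchSize: 160" in the test
    return 0;
  }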



