[llvm] r371148 - AMDGPU: Fix emitting multiple stack loads for stack passed workitems

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 5 16:40:15 PDT 2019


Author: arsenm
Date: Thu Sep  5 16:40:14 2019
New Revision: 371148

URL: http://llvm.org/viewvc/llvm-project?rev=371148&view=rev
Log:
AMDGPU: Fix emitting multiple stack loads for stack passed workitems

The same stack is loaded for each workitem ID, and each use. Nothing
prevents you from creating multiple fixed stack objects with the same
offsets, so this was creating a load for each unique frame index,
despite them being the same offset. Re-use the same frame index so the
loads are CSEable.

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=371148&r1=371147&r2=371148&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Thu Sep  5 16:40:14 2019
@@ -4162,14 +4162,28 @@ SDValue AMDGPUTargetLowering::CreateLive
   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
 }
 
+// This may be called multiple times, and nothing prevents creating multiple
+// objects at the same offset. See if we already defined this object.
+static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
+                                       int64_t Offset) {
+  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+    if (MFI.getObjectOffset(I) == Offset) {
+      assert(MFI.getObjectSize(I) == Size);
+      return I;
+    }
+  }
+
+  return MFI.CreateFixedObject(Size, Offset, true);
+}
+
 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                   EVT VT,
                                                   const SDLoc &SL,
                                                   int64_t Offset) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
+  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
 
-  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll?rev=371148&r1=371147&r2=371148&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll Thu Sep  5 16:40:14 2019
@@ -543,19 +543,25 @@ define void @func_call_too_many_args_use
   ret void
 }
 
+; Only one stack load should be emitted for all 3 values.
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
 ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN: v_and_b32_e32 v32, 0x3ff, v32
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN: v_bfe_u32 v32, v32, 10, 10
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN: v_bfe_u32 v32, v32, 20, 10
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
+; GCN-NOT: buffer_load_dword
+
+; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
+; GCN-NOT: buffer_load_dword
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
+; GCN-NOT: buffer_load_dword
+; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
+; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
 
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @too_many_args_use_workitem_id_xyz(




More information about the llvm-commits mailing list