[llvm] r330257 - [AMDGPU] Fix issues for backend divergence tracking

Wed Apr 18 06:53:31 PDT 2018

Author: dstuttard
Date: Wed Apr 18 06:53:31 2018
New Revision: 330257

URL: http://llvm.org/viewvc/llvm-project?rev=330257&view=rev
Log:
[AMDGPU] Fix issues for backend divergence tracking

Summary:
A change to use divergence analysis in the AMDGPU backend was handling formal
arguments incorrectly: they were not tagged as divergent unless they were
passed in VGPR0, VGPR1 or VGPR2.

For graphics shaders it is possible to have more than these passed in as VGPRs.

Modified the checking code to check for any VGPR registers passed in as formal
arguments.

Also, some intrinsics that are sources of divergence may have been lowered
during instruction selection and are missed on subsequent calls to
isSDNodeSourceOfDivergence - added the relevant AMDGPUISD checks as well.

Finally, the FunctionLoweringInfo tracks virtual registers that are live across
basic block boundaries. This is used to check for divergence of CopyFromRegister
registers using DivergenceAnalysis. With multiple blocks, the lazily evaluated
inverted map VirtReg2Value was not cleared when ValueMap was;
FunctionLoweringInfo::clear() now clears it as well.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D45372

Change-Id: I112f3bd6dfe0f62e63ce9b43b893982778e4bee3

Added:
    llvm/trunk/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp?rev=330257&r1=330256&r2=330257&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp Wed Apr 18 06:53:31 2018
@@ -312,6 +312,7 @@ void FunctionLoweringInfo::set(const Fun
 void FunctionLoweringInfo::clear() {
   MBBMap.clear();
   ValueMap.clear();
+  VirtReg2Value.clear();
   StaticAllocaMap.clear();
   LiveOutRegInfo.clear();
   VisitedBBs.clear();

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=330257&r1=330256&r2=330257&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed Apr 18 06:53:31 2018
@@ -806,12 +806,11 @@ bool AMDGPUTargetLowering::isSDNodeSourc
 
         if (MRI.isLiveIn(Reg)) {
           // workitem.id.x workitem.id.y workitem.id.z
+          // Any VGPR formal argument is also considered divergent
           if ((MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_X) ||
               (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Y) ||
-              (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z)||
-              (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR0) ||
-            (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR1) ||
-            (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR2))
+              (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z) ||
+              (TRI.isVGPR(MRI, Reg)))
               return true;
           // Formal arguments of non-entry functions
           // are conservatively considered divergent
@@ -840,6 +839,12 @@ bool AMDGPUTargetLowering::isSDNodeSourc
     case ISD::INTRINSIC_W_CHAIN:
       return AMDGPU::isIntrinsicSourceOfDivergence(
       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+    // In some cases intrinsics that are a source of divergence have been
+    // lowered to AMDGPUISD so we also need to check those too.
+    case AMDGPUISD::INTERP_MOV:
+    case AMDGPUISD::INTERP_P1:
+    case AMDGPUISD::INTERP_P2:
+      return true;
   }
   return false;
 }

Added: llvm/trunk/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll?rev=330257&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll Wed Apr 18 06:53:31 2018
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; A test case that originally failed in divergence calculation
+; Implementation has to identify all formal args that can be a source of divergence
+
+@0 = external dso_local addrspace(4) constant [6 x <2 x float>]
+
+; GCN-LABEL: {{^}}_amdgpu_vs_main:
+; GCN-NOT: v_readfirstlane
+; PRE-GFX9: flat_load_dword
+; GFX9: global_load 
+define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) local_unnamed_addr #0 {
+.entry:
+  %tmp = add i32 %arg4, %arg8
+  %tmp9 = sext i32 %tmp to i64
+  %tmp10 = getelementptr [6 x <2 x float>], [6 x <2 x float>] addrspace(4)* @0, i64 0, i64 %tmp9
+  %tmp11 = load <2 x float>, <2 x float> addrspace(4)* %tmp10, align 8
+  %tmp12 = fadd nnan arcp contract <2 x float> zeroinitializer, %tmp11
+  %tmp13 = extractelement <2 x float> %tmp12, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float undef, float %tmp13, float 0.000000e+00, float 1.000000e+00, i1 true, i1 false) #1
+  ret void
+}
+
+declare i64 @llvm.amdgcn.s.getpc() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }

Added: llvm/trunk/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll?rev=330257&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll Wed Apr 18 06:53:31 2018
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; Testing for failures in divergence calculations when divergent intrinsic is lowered during instruction selection
+
+@0 = external dso_local addrspace(4) constant [4 x <4 x float>]
+
+; GCN-LABEL: {{^}}_amdgpu_ps_main:
+; GCN-NOT: v_readfirstlane
+; PRE-GFX9: flat_load_dword
+; GFX9: global_load 
+define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {
+.entry:
+  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1
+  %tmp1 = bitcast float %tmp to i32
+  %tmp2 = srem i32 %tmp1, 4
+  %tmp3 = select i1 false, i32 undef, i32 %tmp2
+  %tmp4 = sext i32 %tmp3 to i64
+  %tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4
+  %tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16
+  %tmp7 = extractelement <4 x float> %tmp6, i32 3
+  %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2
+  ret void
+}
+
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #2
+
+attributes #0 = { nounwind "InitialPSInputAddr"="0" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind }