[llvm] 97d9a76 - [AMDGPU] Don't remove short branches over kills

Mon Feb 3 01:27:04 PST 2020

Author: Jay Foad
Date: 2020-02-03T09:26:52Z
New Revision: 97d9a76afc97f48f63056a9f36ecd6bf82774172

URL: https://github.com/llvm/llvm-project/commit/97d9a76afc97f48f63056a9f36ecd6bf82774172
DIFF: https://github.com/llvm/llvm-project/commit/97d9a76afc97f48f63056a9f36ecd6bf82774172.diff

LOG: [AMDGPU] Don't remove short branches over kills

Summary:
D68092 introduced a new SIRemoveShortExecBranches optimization pass, which
broke some graphics shaders. The problem was that the pass removed
branches over KILL pseudo instructions; the fix is to explicitly check
for kill terminators in mustRetainExeczBranch and retain the branch.

Reviewers: critson, arsenm, nhaehnle, cdevadas, hakzsam

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73771

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
index aaadc1604e70..64fca0b46797 100644

--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -96,6 +96,9 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
       if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
         return true;
 
+      if (TII->isKillTerminator(I->getOpcode()))
+        return true;
+
       // These instructions are potentially expensive even if EXEC = 0.
       if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
           I->getOpcode() == AMDGPU::S_WAITCNT)

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 115782863efc..e5c0bea3e0dc 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -396,8 +396,52 @@ bb9:                                              ; preds = %bb4
   ret void
 }
 
+; CHECK-LABEL: {{^}}cbranch_kill:
+; CHECK-NOT: exp null off, off, off, off done vm
+define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
+.entry:
+  %val0 = extractelement <2 x float> %1, i32 0
+  %val1 = extractelement <2 x float> %1, i32 1
+  %p0 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 0, i32 immarg 1, i32 %0) #2
+  %sample = call float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 1, float %p0, float %p0, float %p0, float 0.000000e+00, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
+  %cond0 = fcmp ugt float %sample, 0.000000e+00
+  br i1 %cond0, label %live, label %kill
+
+kill:
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %export
+
+live:
+  %i0 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 0, i32 immarg 0, i32 %0) #2
+  %i1 = call float @llvm.amdgcn.interp.p2(float %i0, float %val1, i32 immarg 0, i32 immarg 0, i32 %0) #2
+  %i2 = call float @llvm.amdgcn.interp.p1(float %val0, i32 immarg 1, i32 immarg 0, i32 %0) #2
+  %i3 = call float @llvm.amdgcn.interp.p2(float %i2, float %val1, i32 immarg 1, i32 immarg 0, i32 %0) #2
+  %scale.i0 = fmul reassoc nnan nsz arcp contract float %i0, %sample
+  %scale.i1 = fmul reassoc nnan nsz arcp contract float %i1, %sample
+  %scale.i2 = fmul reassoc nnan nsz arcp contract float %i2, %sample
+  %scale.i3 = fmul reassoc nnan nsz arcp contract float %i3, %sample
+  br label %export
+
+export:
+  %proxy.0.0 = phi float [ undef, %kill ], [ %scale.i0, %live ]
+  %proxy.0.1 = phi float [ undef, %kill ], [ %scale.i1, %live ]
+  %proxy.0.2 = phi float [ undef, %kill ], [ %scale.i2, %live ]
+  %proxy.0.3 = phi float [ undef, %kill ], [ %scale.i3, %live ]
+  %out.0 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %proxy.0.0, float %proxy.0.1) #2
+  %out.1 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %proxy.0.2, float %proxy.0.3) #2
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> %out.0, <2 x half> %out.1, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #2
+declare float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
 declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare void @llvm.amdgcn.kill(i1) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind writeonly }