[llvm] 096cd99 - AMDGPU: Fix divergence analysis of control flow intrinsics

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 09:31:40 PST 2020


Author: Matt Arsenault
Date: 2020-02-05T09:30:54-08:00
New Revision: 096cd991ee90875603a9cacf3b460ac677258539

URL: https://github.com/llvm/llvm-project/commit/096cd991ee90875603a9cacf3b460ac677258539
DIFF: https://github.com/llvm/llvm-project/commit/096cd991ee90875603a9cacf3b460ac677258539.diff

LOG: AMDGPU: Fix divergence analysis of control flow intrinsics

The mask results of these intrinsics should be uniform. The trickier
part is that the dummy booleans used as IR glue need to be treated as
divergent. This should make the divergence analysis results correct
for the IR the DAG is constructed from.

This should allow us to eliminate requiresUniformRegister, which
performs an expensive, recursive scan over all users looking for
control flow intrinsics, and should avoid the recent compile time
regressions.
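As a quick illustration (distilled from the test added below, using the
same value names), this is how the two components of the struct return
are expected to be classified:

  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)   ; divergent: struct result as a whole
  %if.bool = extractvalue { i1, i64 } %if, 0             ; divergent: dummy boolean glue
  %if.mask = extractvalue { i1, i64 } %if, 1             ; uniform: exec mask, overridden in isAlwaysUniform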

Added: 
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8d70536ec21c..a7eb081d1a25 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -270,5 +270,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
 def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
 def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
 
+// The dummy boolean output is divergent from the IR's perspective,
+// but the mask results are uniform. These produce a divergent and
+// uniform result, so the returned struct is collectively divergent.
+// isAlwaysUniform can override the extract of the uniform component.
+def : SourceOfDivergence<int_amdgcn_if>;
+def : SourceOfDivergence<int_amdgcn_else>;
+def : SourceOfDivergence<int_amdgcn_loop>;
+
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b15ef767f0ed..15ee82b5be6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -706,6 +706,7 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
     case Intrinsic::amdgcn_readlane:
     case Intrinsic::amdgcn_icmp:
     case Intrinsic::amdgcn_fcmp:
+    case Intrinsic::amdgcn_if_break:
       return true;
     }
   }
@@ -720,14 +721,28 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   if (!ExtValue)
     return false;
 
-  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
-    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
-    // divergent for the overall struct return. We need to override it in the
-    // case we're extracting an SGPR component here.
-    if (isa<InlineAsm>(CI->getCalledValue()))
-      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
+  if (!CI)
+    return false;
+
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if:
+    case Intrinsic::amdgcn_else: {
+      ArrayRef<unsigned> Indices = ExtValue->getIndices();
+      return Indices.size() == 1 && Indices[0] == 1;
+    }
+    }
   }
 
+  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
+  // divergent for the overall struct return. We need to override it in the
+  // case we're extracting an SGPR component here.
+  if (isa<InlineAsm>(CI->getCalledValue()))
+    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+
   return false;
 }
 

diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
new file mode 100644
index 000000000000..9446a7e8e9f0
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/control-flow-intrinsics.ll
@@ -0,0 +1,102 @@
+; RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; Tests control flow intrinsics that should be treated as uniform
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if_break':
+; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
+; CHECK-NOT: DIVERGENT
+; CHECK: ret void
+define amdgpu_ps void @test_if_break(i32 %arg0, i64 inreg %saved) {
+entry:
+  %cond = icmp eq i32 %arg0, 0
+  %break = call i64 @llvm.amdgcn.if.break.i64.i64(i1 %cond, i64 %saved)
+  store volatile i64 %break, i64 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if':
+; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
+; CHECK-NEXT: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
+; CHECK-NOT: DIVERGENT
+; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
+define void @test_if(i32 %arg0) {
+entry:
+  %cond = icmp eq i32 %arg0, 0
+  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+  %if.bool = extractvalue { i1, i64 } %if, 0
+  %if.mask = extractvalue { i1, i64 } %if, 1
+  %if.bool.ext = zext i1 %if.bool to i32
+  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
+  store volatile i64 %if.mask, i64 addrspace(1)* undef
+  ret void
+}
+
+; The result should still be treated as divergent, even with a uniform source.
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if_uniform':
+; CHECK-NOT: DIVERGENT
+; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
+; CHECK-NOT: DIVERGENT
+; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
+define amdgpu_ps void @test_if_uniform(i32 inreg %arg0) {
+entry:
+  %cond = icmp eq i32 %arg0, 0
+  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+  %if.bool = extractvalue { i1, i64 } %if, 0
+  %if.mask = extractvalue { i1, i64 } %if, 1
+  %if.bool.ext = zext i1 %if.bool to i32
+  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
+  store volatile i64 %if.mask, i64 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_loop_uniform':
+; CHECK: DIVERGENT: %loop = call i1 @llvm.amdgcn.loop.i64(i64 %mask)
+define amdgpu_ps void @test_loop_uniform(i64 inreg %mask) {
+entry:
+  %loop = call i1 @llvm.amdgcn.loop.i64(i64 %mask)
+  %loop.ext = zext i1 %loop to i32
+  store volatile i32 %loop.ext, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_else':
+; CHECK: DIVERGENT: %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+; CHECK: DIVERGENT:       %else.bool = extractvalue { i1, i64 } %else, 0
+; CHECK: {{^[ \t]+}}%else.mask = extractvalue { i1, i64 } %else, 1
+define amdgpu_ps void @test_else(i64 inreg %mask) {
+entry:
+  %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+  %else.bool = extractvalue { i1, i64 } %else, 0
+  %else.mask = extractvalue { i1, i64 } %else, 1
+  %else.bool.ext = zext i1 %else.bool to i32
+  store volatile i32 %else.bool.ext, i32 addrspace(1)* undef
+  store volatile i64 %else.mask, i64 addrspace(1)* undef
+  ret void
+}
+
+; This case is probably always broken
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_else_divergent_mask':
+; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
+; CHECK-NOT: DIVERGENT
+; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
+define void @test_else_divergent_mask(i64 %mask) {
+entry:
+  %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+  %if.bool = extractvalue { i1, i64 } %if, 0
+  %if.mask = extractvalue { i1, i64 } %if, 1
+  %if.bool.ext = zext i1 %if.bool to i32
+  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
+  store volatile i64 %if.mask, i64 addrspace(1)* undef
+  ret void
+}
+
+declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #0
+declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #0
+declare i64 @llvm.amdgcn.if.break.i64.i64(i1, i64) #1
+declare i1 @llvm.amdgcn.loop.i64(i64) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }




More information about the llvm-commits mailing list