[llvm] cb7b661 - AMDGPU: Analyze divergence of inline asm

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 3 12:42:30 PST 2020


Author: Matt Arsenault
Date: 2020-02-03T12:42:16-08:00
New Revision: cb7b661d3d3b547eaa377bdff0a0c94ea9e5458b

URL: https://github.com/llvm/llvm-project/commit/cb7b661d3d3b547eaa377bdff0a0c94ea9e5458b
DIFF: https://github.com/llvm/llvm-project/commit/cb7b661d3d3b547eaa377bdff0a0c94ea9e5458b.diff

LOG: AMDGPU: Analyze divergence of inline asm
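
For illustration, here is a minimal sketch of the behavior this change adds; the function names are hypothetical and simply mirror cases from the test added below. An inline asm call whose output constraints resolve to SGPR register classes is now reported as uniform, while a VGPR-constrained output is still reported as divergent:

  define i32 @uniform_sgpr_asm() {
    ; "=s" constrains the output to an SGPR, so the result is treated as uniform.
    %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
    ret i32 %sgpr
  }

  define i32 @divergent_vgpr_asm() {
    ; "=v" constrains the output to a VGPR, so the result is still treated as divergent.
    %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
    ret i32 %vgpr
  }

Running the legacy divergence analysis over such IR (as in the RUN lines of the new test) should print DIVERGENT only for %vgpr. For asm returning a struct with mixed SGPR and VGPR outputs, the aggregate is inferred divergent, but an extractvalue of an SGPR component is overridden to uniform via isAlwaysUniform, as exercised in llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll.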

Added: 
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/inline-asm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a255a49b26b6..514842b50647 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -578,8 +578,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   }
 }
 
-
-
 static bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
@@ -606,6 +604,54 @@ static bool isArgPassedInSGPR(const Argument *A) {
   }
 }
 
+/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
+/// this is analyzing the collective result of all output registers. Otherwise,
+/// this is only querying a specific result index if this returns multiple
+/// registers in a struct.
+bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
+  const CallInst *CI, ArrayRef<unsigned> Indices) const {
+  // TODO: Handle complex extract indices
+  if (Indices.size() > 1)
+    return true;
+
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+  ImmutableCallSite CS(CI);
+  TargetLowering::AsmOperandInfoVector TargetConstraints
+    = TLI->ParseConstraints(DL, ST->getRegisterInfo(), CS);
+
+  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
+
+  int OutputIdx = 0;
+  for (auto &TC : TargetConstraints) {
+    if (TC.Type != InlineAsm::isOutput)
+      continue;
+
+    // Skip outputs we don't care about.
+    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
+      continue;
+
+    TLI->ComputeConstraintToUse(TC, SDValue());
+
+    Register AssignedReg;
+    const TargetRegisterClass *RC;
+    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
+      TRI, TC.ConstraintCode, TC.ConstraintVT);
+    if (AssignedReg) {
+      // FIXME: This is a workaround for getRegForInlineAsmConstraint
+      // returning VS_32
+      RC = TRI->getPhysRegClass(AssignedReg);
+    }
+
+    // For AGPR constraints null is returned on subtargets without AGPRs, so
+    // assume divergent for null.
+    if (!RC || !TRI->isSGPRClass(RC))
+      return true;
+  }
+
+  return false;
+}
+
 /// \returns true if the new GPU divergence analysis is enabled.
 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
   return !UseLegacyDA;
@@ -638,7 +684,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
 
   // Assume all function calls are a source of divergence.
-  if (isa<CallInst>(V) || isa<InvokeInst>(V))
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return isInlineAsmSourceOfDivergence(CI);
+    return true;
+  }
+
+  // Assume all function calls are a source of divergence.
+  if (isa<InvokeInst>(V))
     return true;
 
   return false;
@@ -656,6 +709,19 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
       return true;
     }
   }
+
+  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
+  if (!ExtValue)
+    return false;
+
+  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
+    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
+    // divergent for the overall struct return. We need to override it in the
+    // case we're extracting an SGPR component here.
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+  }
+
   return false;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 7dd692be5530..dee2e7d314d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -70,7 +70,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   friend BaseT;
 
   const GCNSubtarget *ST;
-  const AMDGPUTargetLowering *TLI;
+  const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
   bool HasFP32Denormals;
@@ -183,6 +183,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   unsigned getCFInstrCost(unsigned Opcode);
 
+  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
+                                     ArrayRef<unsigned> Indices = {}) const;
+
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
   bool isAlwaysUniform(const Value *V) const;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ea40c89ce639..77ae13d60281 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10586,6 +10586,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         return std::make_pair(RC->getRegister(Idx), RC);
     }
   }
+
+  // FIXME: Returns VS_32 for physical SGPR constraints
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
new file mode 100644
index 000000000000..8443b82f3888
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
@@ -0,0 +1,108 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx908 -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; Make sure nothing crashes on targets with or without AGPRs
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_virtreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_physreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+define i32 @inline_asm_1_vgpr_virtreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+define i32 @inline_asm_1_vgpr_physreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
+define i32 @inline_asm_1_agpr_virtreg_output() {
+  %vgpr = call i32 asm "; def $0", "=a"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
+define i32 @inline_asm_1_agpr_physreg_output() {
+  %vgpr = call i32 asm "; def a0", "={a0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define void @inline_asm_2_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
+  %sgpr0 = extractvalue { i32, i32 } %asm, 0
+  %sgpr1 = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr0, i32 addrspace(1)* undef
+  store i32 %sgpr1, i32 addrspace(1)* undef
+  ret void
+}
+
+; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
+; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_sgpr_vgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+  %sgpr = extractvalue { i32, i32 } %asm, 0
+  %vgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr, i32 addrspace(1)* undef
+  store i32 %vgpr, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
+; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_vgpr_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+  %vgpr = extractvalue { i32, i32 } %asm, 0
+  %sgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %vgpr, i32 addrspace(1)* undef
+  store i32 %sgpr, i32 addrspace(1)* undef
+  ret void
+}
+
+; Have an extra output constraint
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'multi_sgpr_inline_asm_output_input_constraint':
+; CHECK-NOT: DIVERGENT
+define void @multi_sgpr_inline_asm_output_input_constraint() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s,s"(i32 1234)
+  %sgpr0 = extractvalue { i32, i32 } %asm, 0
+  %sgpr1 = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr0, i32 addrspace(1)* undef
+  store i32 %sgpr1, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
+; CHECK: DIVERGENT:       %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
+; CHECK-NEXT: DIVERGENT:       %vgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_vgpr_sgpr_virtreg_output_input_constraint() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
+  %vgpr = extractvalue { i32, i32 } %asm, 0
+  %sgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %vgpr, i32 addrspace(1)* undef
+  store i32 %sgpr, i32 addrspace(1)* undef
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index adf786c0e92a..48e12295989d 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -21,11 +21,30 @@ entry:
 }
 
 
-; CHECK: {{^}}branch_on_asm:
-; Make sure inline assembly is treted as divergent.
-; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
+; Make sure VGPR inline assembly is treated as divergent.
+; CHECK: v_mov_b32 v{{[0-9]+}}, 0
+; CHECK: v_cmp_eq_u32
 ; CHECK: s_and_saveexec_b64
-define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
+	%zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
+	%cmp = icmp eq i32 %zero, 0
+	br i1 %cmp, label %if, label %endif
+
+if:
+	store i32 0, i32 addrspace(1)* %out
+	br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
+; Make sure SGPR inline assembly is treated as uniform
+; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK: s_cmp_lg_u32
+; CHECK: s_cbranch_scc0
+define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
 	%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
 	%cmp = icmp eq i32 %zero, 0
 	br i1 %cmp, label %if, label %endif

