[llvm] [AMDGPU][Attributor] Remove uniformity check in the indirect call specialization callback (PR #106177)

Mon Aug 26 21:29:02 PDT 2024

https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/106177

This patch removes the conservative uniformity check in the indirect call
specialization callback, as whether the function pointer is uniform doesn't
matter too much. Instead, we add an argument to control specialization.

>From 3d231174d3691aa81a17f0a55b95415cc8d4d1b2 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 27 Aug 2024 00:08:37 -0400
Subject: [PATCH] [AMDGPU][Attributor] Remove uniformity check in the indirect
 call specialization callback

This patch removes the conservative uniformity check in the indirect call
specialization callback, as whether the function pointer is uniform doesn't
matter too much. Instead, we add an argument to control specialization.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 21 +++----
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       | 14 ++++-
 .../CodeGen/AMDGPU/simple-indirect-call-2.ll  | 63 ++++++++++++++++---
 3 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 81932cc2c3c3bc..e339d93b00b043 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,6 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -33,6 +32,12 @@ static cl::opt<unsigned> KernargPreloadCount(
     "amdgpu-kernarg-preload-count",
     cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
 
+static cl::opt<unsigned> IndirectCallSpecializationThreshold(
+    "amdgpu-indrect-call-specialization-threshold",
+    cl::desc(
+        "A threshold controls whether an indirect call will be specialized"),
+    cl::init(3));
+
 #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
 
 enum ImplicitArgumentPositions {
@@ -1049,16 +1054,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   AC.IsModulePass = true;
   AC.DefaultInitializeLiveInternals = false;
   AC.IndirectCalleeSpecializationCallback =
-      [&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
-            Function &Callee, unsigned NumAssumedCallees) {
-        if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
-          return false;
-        // Singleton functions can be specialized.
-        if (NumAssumedCallees == 1)
-          return true;
-        // Otherwise specialize uniform values.
-        const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
-        return TTI.isAlwaysUniform(CB.getCalledOperand());
+      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
+         Function &Callee, unsigned NumAssumedCallees) {
+        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
+               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
       };
   AC.IPOAmendableCB = [](const Function &F) {
     return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 33b1cc65dc5699..e5d440b96349f6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -231,7 +231,19 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
 ; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
-; CHECK-NEXT:    call void [[FPTR]]()
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @also_empty()
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:       3:
+; CHECK-NEXT:    br i1 true, label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @empty()
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       5:
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    ret void
 ;
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 850446c414049d..aa090499c1531b 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck --check-prefixes=CHECK,OW %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor<closed-world>' %s | FileCheck --check-prefixes=CHECK,CW %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor<closed-world>' -amdgpu-indrect-call-specialization-threshold=0 %s | FileCheck --check-prefixes=CHECK,NO %s
 
 target datalayout = "A5"
 
@@ -9,8 +10,8 @@ target datalayout = "A5"
 ;.
 ; CHECK: @G = global i32 0, align 4
 ;.
-define void @bar() {
-; CHECK-LABEL: define {{[^@]+}}@bar
+define void @bar1() {
+; CHECK-LABEL: define {{[^@]+}}@bar1
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    store i32 1, ptr @G, align 4
@@ -21,14 +22,36 @@ entry:
   ret void
 }
 
-define ptr @helper() {
-; CHECK-LABEL: define {{[^@]+}}@helper
+define void @bar2() {
+; CHECK-LABEL: define {{[^@]+}}@bar2
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret ptr @bar
+; CHECK-NEXT:    store i32 2, ptr @G, align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
-  ret ptr @bar
+  store i32 2, ptr @G, align 4
+  ret void
+}
+
+define ptr @helper1() {
+; CHECK-LABEL: define {{[^@]+}}@helper1
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret ptr @bar1
+;
+entry:
+  ret ptr @bar1
+}
+
+define ptr @helper2() {
+; CHECK-LABEL: define {{[^@]+}}@helper2
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret ptr @bar2
+;
+entry:
+  ret ptr @bar2
 }
 
 define amdgpu_kernel void @foo(ptr noundef %fp) {
@@ -45,10 +68,29 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
 ; CW-NEXT:  entry:
 ; CW-NEXT:    [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 ; CW-NEXT:    store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; CW-NEXT:    [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
-; CW-NEXT:    call void @bar()
+; CW-NEXT:    [[TMP0:%.*]] = icmp eq ptr [[FP]], @bar1
+; CW-NEXT:    br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]]
+; CW:       1:
+; CW-NEXT:    call void @bar1()
+; CW-NEXT:    br label [[TMP5:%.*]]
+; CW:       2:
+; CW-NEXT:    br i1 true, label [[TMP3:%.*]], label [[TMP4:%.*]]
+; CW:       3:
+; CW-NEXT:    call void @bar2()
+; CW-NEXT:    br label [[TMP5]]
+; CW:       4:
+; CW-NEXT:    unreachable
+; CW:       5:
 ; CW-NEXT:    ret void
 ;
+; NO-LABEL: define {{[^@]+}}@foo
+; NO-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] {
+; NO-NEXT:  entry:
+; NO-NEXT:    [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+; NO-NEXT:    store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
+; NO-NEXT:    call void [[FP]](), !callees [[META0:![0-9]+]]
+; NO-NEXT:    ret void
+;
 entry:
   %fp.addr = alloca ptr, addrspace(5)
   store ptr %fp, ptr addrspace(5) %fp.addr
@@ -57,6 +99,9 @@ entry:
   ret void
 }
 
+;.
+; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
 ; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
@@ -64,3 +109,5 @@ entry:
 ; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
+; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
+;.