[llvm] [AMDGPU] support lib call (PR #74741)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 7 09:41:29 PST 2023


https://github.com/yxsamliu created https://github.com/llvm/llvm-project/pull/74741

During instruction selection some operations may be lowered to call of library functions, e.g. 128 bit integer division lowered to call of compiler-rt function __divti3. Currently AMDGPU backend is unable to generate call of external symbol and this results in isel failure and crash the compiler.

This patch looks up the external symbol in the same module and if it is found, converts the call of external symbol to a normal function call so that the isel succeeds.

The library functions which are used to fullfill the library function call generated by isel have function attribute "amdgpu-lib-fun". They are not internalized, therefore are kept until isel even if they are not used by user's code.

During isel, if calls of such library functions are generated, they will be added function attribute "amdgpu-backend-used". Otherwise, they will become empty to save time optimizing them, and they will not be emitted by the assembler.

>From 70b9c288b4ed0456469f2a76e7191221715d3b8e Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Thu, 7 Dec 2023 12:21:43 -0500
Subject: [PATCH] [AMDGPU] support lib call

During instruction selection some operations may be lowered
to call of library functions, e.g. 128 bit integer division
lowered to call of compiler-rt function __divti3. Currently
AMDGPU backend is unable to generate call of external
symbol and this results in isel failure and crash the
compiler.

This patch looks up the external symbol in the same module
and if it is found, converts the call of external symbol
to a normal function call so that the isel succeeds.

The library functions which are used to fullfill the library
function call generated by isel have function attribute
"amdgpu-lib-fun". They are not internalized, therefore
are kept until isel even if they are not used by user's code.

During isel, if calls of such library functions are generated,
they will be added function attribute "amdgpu-backend-used".
Otherwise, they will become empty to save time optimizing them,
and they will not be emitted by the assembler.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  4 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  4 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 +
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 17 ++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 69 ++++++++++++++++++-
 .../{div_i128.ll => div_i128_no_libfun.ll}    |  4 +-
 .../CodeGen/AMDGPU/div_i128_with_libfun.ll    | 44 ++++++++++++
 llvm/test/CodeGen/AMDGPU/unused_libfun.ll     | 22 ++++++
 8 files changed, 158 insertions(+), 7 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{div_i128.ll => div_i128_no_libfun.ll} (60%)
 create mode 100644 llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/unused_libfun.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4bf1f1357b694e..0f3be41032b8d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -452,6 +452,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getFunction().hasFnAttribute("amdgpu-lib-fun") &&
+      !MF.getFunction().hasFnAttribute("amdgpu-backend-used"))
+    return false;
+
   // Init target streamer lazily on the first function so that previous passes
   // can set metadata.
   if (!IsTargetStreamerInitialized)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1fc..04da2f5376e4f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -506,9 +506,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
   AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
 
-  // There are no libcalls of any kind.
   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+  // Supported compiler-rt libcalls should be enabled in compiler-rt for
+  // amdgcn first then added here.
+  setLibcallName(RTLIB::SDIV_I128, "__divti3");
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0c38fa32c6f33a..484b6ecceca516 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -603,6 +603,7 @@ static bool mustPreserveGV(const GlobalValue &GV) {
   if (const Function *F = dyn_cast<Function>(&GV))
     return F->isDeclaration() || F->getName().startswith("__asan_") ||
            F->getName().startswith("__sanitizer_") ||
+           F->hasFnAttribute("amdgpu-lib-fun") ||
            AMDGPU::isEntryFunctionCC(F->getCallingConv());
 
   GV.removeDeadConstantUsers();
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 709de612d81d4a..ff1ae6ade9eb44 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,6 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 
 #define DEBUG_TYPE "si-fold-operands"
@@ -2038,8 +2039,22 @@ bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
+  auto &F = MF.getFunction();
+  if (skipFunction(F))
     return false;
+  // We cannot delete unused library function, but we can make it empty to
+  // save the time optimizing it.
+  if (F.hasFnAttribute("amdgpu-lib-fun") &&
+      !F.hasFnAttribute("amdgpu-backend-used") && F.use_empty()) {
+    while (!MF.empty()) {
+      MachineBasicBlock &MBB = MF.front();
+      MBB.clear();
+      MF.erase(&MBB);
+    }
+    // We need at least one BB to make the machine function valid.
+    MF.push_back(MF.CreateMachineBasicBlock());
+    return true;
+  }
 
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..e49f25626b4a33 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,8 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
@@ -3372,6 +3374,48 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return true;
 }
 
+/// Add attribute to a function and functions called directly or indirectly by
+/// it.
+static void addAttributeToCalledFunctions(Function *F,
+                                          StringRef AttributeName) {
+  if (F->hasFnAttribute(AttributeName))
+    return;
+
+  SmallSet<Function *, 8> ProcessedFunctions;
+  SmallVector<Function *, 8> WorkList;
+
+  WorkList.push_back(F);
+
+  while (!WorkList.empty()) {
+    Function *Current = WorkList.back();
+    WorkList.pop_back();
+
+    if (ProcessedFunctions.find(Current) != ProcessedFunctions.end() ||
+        Current->hasFnAttribute(AttributeName)) {
+      continue;
+    }
+
+    ProcessedFunctions.insert(Current);
+    Current->addFnAttr(AttributeName);
+
+    for (auto &BasicBlock : *Current) {
+      for (auto &Instruction : BasicBlock) {
+        if (auto *CI = dyn_cast<CallInst>(&Instruction)) {
+          if (Function *CalledFunc = CI->getCalledFunction()) {
+            if (CalledFunc->isIntrinsic())
+              continue;
+
+            if (ProcessedFunctions.find(CalledFunc) ==
+                    ProcessedFunctions.end() &&
+                !CalledFunc->hasFnAttribute(AttributeName))
+              WorkList.push_back(CalledFunc);
+          }
+        }
+      }
+    }
+  }
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
@@ -3431,8 +3475,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                               "unsupported call to variadic function ");
   }
 
-  if (!CLI.CB)
-    report_fatal_error("unsupported libcall legalization");
+  if (!CLI.CB && Callee.getNode()->getOpcode() != ISD::ExternalSymbol)
+    report_fatal_error(
+        "unsupported libcall legalization: Callee function is unknown");
 
   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
     return lowerUnhandledCall(CLI, InVals,
@@ -3635,10 +3680,28 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
+  bool AddTargetGlobalAddr = true;
+  // During instruction selection, unsupported operations may be lowered to
+  // lib calls as call of external symbols, e.g. 128 bit signed integer devision
+  // lowered to call of __divti3. If we can find definition of the called
+  // function in the module, we can replace the external symbol with a normal
+  // function. Otherwise, emit an error for unsupported call to lib or external
+  // function.
+  if (isa<ExternalSymbolSDNode>(Callee)) {
+    Function *F = MF.getFunction().getParent()->getFunction(
+        cast<ExternalSymbolSDNode>(Callee)->getSymbol());
+    if (!F)
+      return lowerUnhandledCall(
+          CLI, InVals, "unsupported call to lib or external function ");
+    Callee = DAG.getSymbolFunctionGlobalAddress(Callee);
+    addAttributeToCalledFunctions(F, "amdgpu-backend-used");
+    AddTargetGlobalAddr = false;
+  }
   Ops.push_back(Callee);
   // Add a redundant copy of the callee global which will not be legalized, as
   // we need direct access to the callee later.
-  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
+  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (GSD && AddTargetGlobalAddr) {
     const GlobalValue *GV = GSD->getGlobal();
     Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
   } else {
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
similarity index 60%
rename from llvm/test/CodeGen/AMDGPU/div_i128.ll
rename to llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
index 4aa97c57cbd9c2..b03f088d2f26bb 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
@@ -1,7 +1,7 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
 
-; SDAG-ERR: LLVM ERROR: unsupported libcall legalization
+; SDAG-ERR: error: {{.*}} in function v_sdiv_i128_vv i128 (i128, i128): unsupported call to lib or external function __divti3
 ; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_i128_vv)
 define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
   %shl = sdiv i128 %lhs, %rhs
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll b/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
new file mode 100644
index 00000000000000..f38350b53aef46
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=amdgcn -O3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_i128_sdiv:
+; GCN: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; GCN-NEXT: s_add_u32 s[[LO]], s[[LO]], __divti3 at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[HI]], s[[HI]], __divti3 at rel32@hi+12
+; GCN: s_swappc_b64 s[[[RET_LO:[0-9]+]]:[[RET_HI:[0-9]+]]], s[[[LO]]:[[HI]]]
+
+; GCN-LABEL: {{^}}__divti3:
+; GCN: s_getpc_b64 s[[[LO2:[0-9]+]]:[[HI2:[0-9]+]]]
+; GCN-NEXT: s_add_u32 s[[LO2]], s[[LO2]], __divti3_impl at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[HI2]], s[[HI2]], __divti3_impl at rel32@hi+12
+; GCN: s_swappc_b64 s[[[RET_LO2:[0-9]+]]:[[RET_HI2:[0-9]+]]], s[[[LO2]]:[[HI2]]]
+; GCN: s_setpc_b64 s[[[RET_LO]]:[[RET_HI]]]
+
+; GCN-LABEL: {{^}}__divti3_impl:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: v_add_i32_e32 v0, vcc, v0, v4
+; GCN: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GCN: v_addc_u32_e32 v2, vcc, v2, v6, vcc
+; GCN: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN: s_setpc_b64 s[[[RET_LO2]]:[[RET_HI2]]]
+
+define amdgpu_kernel void @test_i128_sdiv(ptr addrspace(1) %x, i128 %y, i128 %z) {
+entry:
+  %div = sdiv i128 %y, %z
+  store i128 %div, ptr addrspace(1) %x, align 16
+  ret void
+}
+
+; compiler-rt lib function for 128 bit signed division
+define hidden i128 @__divti3(i128 %a, i128 %b) #0 {
+entry:
+  %call = call i128 @__divti3_impl(i128 %a, i128 %b)
+  ret i128 %call
+}
+
+define hidden i128 @__divti3_impl(i128 %a, i128 %b) #0 {
+entry:
+  %add = add i128 %a, %b
+  ret i128 %add
+}
+
+attributes #0 = { "amdgpu-lib-fun" }
diff --git a/llvm/test/CodeGen/AMDGPU/unused_libfun.ll b/llvm/test/CodeGen/AMDGPU/unused_libfun.ll
new file mode 100644
index 00000000000000..bc7b22dc40d68b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unused_libfun.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -O3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_i128_add:
+; GCN-NOT: s_swappc_b64
+
+; GCN-NOT: {{^}}__divti3:
+
+define amdgpu_kernel void @test_i128_add(ptr addrspace(1) %x, i128 %y, i128 %z) {
+entry:
+  %add = add i128 %y, %z
+  store i128 %add, ptr addrspace(1) %x, align 16
+  ret void
+}
+
+; unused lib function should be removed
+define hidden i128 @__divti3(i128 %a, i128 %b) #0 {
+entry:
+  %add = add i128 %a, %b
+  ret i128 %add
+}
+
+attributes #0 = { "amdgpu-lib-fun" }



More information about the llvm-commits mailing list