[llvm] [AMDGPU] support lib call (PR #74741)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 31 10:28:31 PST 2024


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/74741

>From 71b9f554c9a34a1636f9a1a7a2891139fe4dbde4 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Thu, 7 Dec 2023 12:21:43 -0500
Subject: [PATCH] [AMDGPU] support lib call

During instruction selection some operations may be lowered
to call of library functions, e.g. 128 bit integer division
lowered to call of compiler-rt function __divti3. Currently
AMDGPU backend is unable to generate call of external
symbol and this results in isel failure and crash the
compiler.

This patch looks up the external symbol in the same module
and if it is found, converts the call of external symbol
to a normal function call so that the isel succeeds.

The library functions which are used to fullfill the library
function call generated by isel have function attribute
"amdgpu-lib-fun". They are not internalized, therefore
are kept until isel even if they are not used by user's code.

During isel, if calls of such library functions are generated,
they will be added function attribute "amdgpu-backend-used".
Otherwise, they will become empty to save time optimizing them,
and they will not be emitted by the assembler.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  4 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  3 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 +
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 17 ++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 69 ++++++++++++++++++-
 .../{div_i128.ll => div_i128_no_libfun.ll}    |  4 +-
 .../CodeGen/AMDGPU/div_i128_with_libfun.ll    | 44 ++++++++++++
 llvm/test/CodeGen/AMDGPU/unused_libfun.ll     | 22 ++++++
 8 files changed, 158 insertions(+), 6 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{div_i128.ll => div_i128_no_libfun.ll} (60%)
 create mode 100644 llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/unused_libfun.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 279ef8ca2751a..78d030f727df2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -454,6 +454,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getFunction().hasFnAttribute("amdgpu-lib-fun") &&
+      !MF.getFunction().hasFnAttribute("amdgpu-backend-used"))
+    return false;
+
   // Init target streamer lazily on the first function so that previous passes
   // can set metadata.
   if (!IsTargetStreamerInitialized)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b420e72d87ed0..ad4db1923bb71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -528,6 +528,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
       setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
   }
+  // Supported compiler-rt libcalls should be enabled in compiler-rt for
+  // amdgcn first then added here.
+  setLibcallName(RTLIB::SDIV_I128, "__divti3");
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b8a7a5e208021..3d1dabfa6db43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -611,6 +611,7 @@ static bool mustPreserveGV(const GlobalValue &GV) {
   if (const Function *F = dyn_cast<Function>(&GV))
     return F->isDeclaration() || F->getName().starts_with("__asan_") ||
            F->getName().starts_with("__sanitizer_") ||
+           F->hasFnAttribute("amdgpu-lib-fun") ||
            AMDGPU::isEntryFunctionCC(F->getCallingConv());
 
   GV.removeDeadConstantUsers();
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a812cdc61500c..c8491267b1c6c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,6 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 
 #define DEBUG_TYPE "si-fold-operands"
@@ -2133,8 +2134,22 @@ bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
+  auto &F = MF.getFunction();
+  if (skipFunction(F))
     return false;
+  // We cannot delete unused library function, but we can make it empty to
+  // save the time optimizing it.
+  if (F.hasFnAttribute("amdgpu-lib-fun") &&
+      !F.hasFnAttribute("amdgpu-backend-used") && F.use_empty()) {
+    while (!MF.empty()) {
+      MachineBasicBlock &MBB = MF.front();
+      MBB.clear();
+      MF.erase(&MBB);
+    }
+    // We need at least one BB to make the machine function valid.
+    MF.push_back(MF.CreateMachineBasicBlock());
+    return true;
+  }
 
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3d4adb16a2716..2e42545021f2c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,8 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
@@ -3536,6 +3538,48 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return true;
 }
 
+/// Add attribute to a function and functions called directly or indirectly by
+/// it.
+static void addAttributeToCalledFunctions(Function *F,
+                                          StringRef AttributeName) {
+  if (F->hasFnAttribute(AttributeName))
+    return;
+
+  SmallSet<Function *, 8> ProcessedFunctions;
+  SmallVector<Function *, 8> WorkList;
+
+  WorkList.push_back(F);
+
+  while (!WorkList.empty()) {
+    Function *Current = WorkList.back();
+    WorkList.pop_back();
+
+    if (ProcessedFunctions.find(Current) != ProcessedFunctions.end() ||
+        Current->hasFnAttribute(AttributeName)) {
+      continue;
+    }
+
+    ProcessedFunctions.insert(Current);
+    Current->addFnAttr(AttributeName);
+
+    for (auto &BasicBlock : *Current) {
+      for (auto &Instruction : BasicBlock) {
+        if (auto *CI = dyn_cast<CallInst>(&Instruction)) {
+          if (Function *CalledFunc = CI->getCalledFunction()) {
+            if (CalledFunc->isIntrinsic())
+              continue;
+
+            if (ProcessedFunctions.find(CalledFunc) ==
+                    ProcessedFunctions.end() &&
+                !CalledFunc->hasFnAttribute(AttributeName))
+              WorkList.push_back(CalledFunc);
+          }
+        }
+      }
+    }
+  }
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
@@ -3595,8 +3639,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                               "unsupported call to variadic function ");
   }
 
-  if (!CLI.CB)
-    report_fatal_error("unsupported libcall legalization");
+  if (!CLI.CB && Callee.getNode()->getOpcode() != ISD::ExternalSymbol)
+    report_fatal_error(
+        "unsupported libcall legalization: Callee function is unknown");
 
   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
     return lowerUnhandledCall(CLI, InVals,
@@ -3799,10 +3844,28 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
+  bool AddTargetGlobalAddr = true;
+  // During instruction selection, unsupported operations may be lowered to
+  // lib calls as call of external symbols, e.g. 128 bit signed integer devision
+  // lowered to call of __divti3. If we can find definition of the called
+  // function in the module, we can replace the external symbol with a normal
+  // function. Otherwise, emit an error for unsupported call to lib or external
+  // function.
+  if (isa<ExternalSymbolSDNode>(Callee)) {
+    Function *F = MF.getFunction().getParent()->getFunction(
+        cast<ExternalSymbolSDNode>(Callee)->getSymbol());
+    if (!F)
+      return lowerUnhandledCall(
+          CLI, InVals, "unsupported call to lib or external function ");
+    Callee = DAG.getSymbolFunctionGlobalAddress(Callee);
+    addAttributeToCalledFunctions(F, "amdgpu-backend-used");
+    AddTargetGlobalAddr = false;
+  }
   Ops.push_back(Callee);
   // Add a redundant copy of the callee global which will not be legalized, as
   // we need direct access to the callee later.
-  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
+  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (GSD && AddTargetGlobalAddr) {
     const GlobalValue *GV = GSD->getGlobal();
     Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
   } else {
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
similarity index 60%
rename from llvm/test/CodeGen/AMDGPU/div_i128.ll
rename to llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
index 4aa97c57cbd9c..b03f088d2f26b 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128_no_libfun.ll
@@ -1,7 +1,7 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
 
-; SDAG-ERR: LLVM ERROR: unsupported libcall legalization
+; SDAG-ERR: error: {{.*}} in function v_sdiv_i128_vv i128 (i128, i128): unsupported call to lib or external function __divti3
 ; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_i128_vv)
 define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
   %shl = sdiv i128 %lhs, %rhs
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll b/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
new file mode 100644
index 0000000000000..f38350b53aef4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/div_i128_with_libfun.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=amdgcn -O3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_i128_sdiv:
+; GCN: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; GCN-NEXT: s_add_u32 s[[LO]], s[[LO]], __divti3 at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[HI]], s[[HI]], __divti3 at rel32@hi+12
+; GCN: s_swappc_b64 s[[[RET_LO:[0-9]+]]:[[RET_HI:[0-9]+]]], s[[[LO]]:[[HI]]]
+
+; GCN-LABEL: {{^}}__divti3:
+; GCN: s_getpc_b64 s[[[LO2:[0-9]+]]:[[HI2:[0-9]+]]]
+; GCN-NEXT: s_add_u32 s[[LO2]], s[[LO2]], __divti3_impl at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[HI2]], s[[HI2]], __divti3_impl at rel32@hi+12
+; GCN: s_swappc_b64 s[[[RET_LO2:[0-9]+]]:[[RET_HI2:[0-9]+]]], s[[[LO2]]:[[HI2]]]
+; GCN: s_setpc_b64 s[[[RET_LO]]:[[RET_HI]]]
+
+; GCN-LABEL: {{^}}__divti3_impl:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: v_add_i32_e32 v0, vcc, v0, v4
+; GCN: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GCN: v_addc_u32_e32 v2, vcc, v2, v6, vcc
+; GCN: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN: s_setpc_b64 s[[[RET_LO2]]:[[RET_HI2]]]
+
+define amdgpu_kernel void @test_i128_sdiv(ptr addrspace(1) %x, i128 %y, i128 %z) {
+entry:
+  %div = sdiv i128 %y, %z
+  store i128 %div, ptr addrspace(1) %x, align 16
+  ret void
+}
+
+; compiler-rt lib function for 128 bit signed division
+define hidden i128 @__divti3(i128 %a, i128 %b) #0 {
+entry:
+  %call = call i128 @__divti3_impl(i128 %a, i128 %b)
+  ret i128 %call
+}
+
+define hidden i128 @__divti3_impl(i128 %a, i128 %b) #0 {
+entry:
+  %add = add i128 %a, %b
+  ret i128 %add
+}
+
+attributes #0 = { "amdgpu-lib-fun" }
diff --git a/llvm/test/CodeGen/AMDGPU/unused_libfun.ll b/llvm/test/CodeGen/AMDGPU/unused_libfun.ll
new file mode 100644
index 0000000000000..bc7b22dc40d68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unused_libfun.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -O3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_i128_add:
+; GCN-NOT: s_swappc_b64
+
+; GCN-NOT: {{^}}__divti3:
+
+define amdgpu_kernel void @test_i128_add(ptr addrspace(1) %x, i128 %y, i128 %z) {
+entry:
+  %add = add i128 %y, %z
+  store i128 %add, ptr addrspace(1) %x, align 16
+  ret void
+}
+
+; unused lib function should be removed
+define hidden i128 @__divti3(i128 %a, i128 %b) #0 {
+entry:
+  %add = add i128 %a, %b
+  ret i128 %add
+}
+
+attributes #0 = { "amdgpu-lib-fun" }



More information about the llvm-commits mailing list