[llvm] 5dfdd34 - AMDGPU: Don't try to fold wavefrontsize intrinsic in libcall simplify

Tue Aug 1 15:20:57 PDT 2023

Author: Matt Arsenault
Date: 2023-08-01T18:20:50-04:00
New Revision: 5dfdd3494bf25b30c0161861fc080a17444e6f27

URL: https://github.com/llvm/llvm-project/commit/5dfdd3494bf25b30c0161861fc080a17444e6f27
DIFF: https://github.com/llvm/llvm-project/commit/5dfdd3494bf25b30c0161861fc080a17444e6f27.diff

LOG: AMDGPU: Don't try to fold wavefrontsize intrinsic in libcall simplify

llvm.amdgcn.wavefrontsize is an intrinsic, not a libcall, so folding it
doesn't really belong in the libcall simplifier to begin with. Relying
on checking the target CPU name and explicit feature string isn't
particularly sound either. The library doesn't use the intrinsic
anymore, so it doesn't matter anyway.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b82db82de84ef2..86778ed301858f 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -48,7 +48,7 @@ FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 
 FunctionPass *createSIPostRABundlerPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
+FunctionPass *createAMDGPUSimplifyLibCallsPass();
 FunctionPass *createAMDGPUUseNativeCallsPass();
 ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
@@ -60,11 +60,8 @@ FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
-  AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
+  AMDGPUSimplifyLibCallsPass() {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-
-private:
-  TargetMachine &TM;
 };
 
 struct AMDGPUUseNativeCallsPass : PassInfoMixin<AMDGPUUseNativeCallsPass> {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 67d19b6deef003..cf08f072a01565 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -20,7 +20,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Target/TargetMachine.h"
 #include <cmath>
 
 #define DEBUG_TYPE "amdgpu-simplifylib"
@@ -49,8 +48,6 @@ class AMDGPULibCalls {
 
   typedef llvm::AMDGPULibFunc FuncInfo;
 
-  const TargetMachine *TM;
-
   bool UnsafeFPMath = false;
 
   // -fuse-native.
@@ -101,13 +98,11 @@ class AMDGPULibCalls {
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                             const FuncInfo &FInfo);
 
-  // llvm.amdgcn.wavefrontsize
-  bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
-
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
   AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
+
   // Get a scalar native builtin single argument FP function
   FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
 
@@ -126,7 +121,7 @@ class AMDGPULibCalls {
   }
 
 public:
-  AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+  AMDGPULibCalls() {}
 
   bool fold(CallInst *CI);
 
@@ -148,8 +143,7 @@ namespace {
   public:
     static char ID; // Pass identification
 
-    AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), Simplifier(TM) {
+    AMDGPUSimplifyLibCalls() : FunctionPass(ID) {
       initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
 
@@ -602,18 +596,8 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
 bool AMDGPULibCalls::fold(CallInst *CI) {
   Function *Callee = CI->getCalledFunction();
   // Ignore indirect calls.
-  if (!Callee || CI->isNoBuiltin())
-    return false;
-
-  IRBuilder<> B(CI);
-  switch (Callee->getIntrinsicID()) {
-  case Intrinsic::not_intrinsic:
-    break;
-  case Intrinsic::amdgcn_wavefrontsize:
-    return !EnablePreLink && fold_wavefrontsize(CI, B);
-  default:
+  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
     return false;
-  }
 
   FuncInfo FInfo;
   if (!parseFunctionName(Callee->getName(), FInfo))
@@ -629,6 +613,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   if (TDOFold(CI, FInfo))
     return true;
 
+  IRBuilder<> B(CI);
+
   if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
     // Under unsafe-math, evaluate calls if possible.
     // According to Brian Sumner, we can do this for all f32 function calls
@@ -1310,28 +1296,6 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
   return true;
 }
 
-bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
-  if (!TM)
-    return false;
-
-  StringRef CPU = TM->getTargetCPU();
-  StringRef Features = TM->getTargetFeatureString();
-  if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
-      (Features.empty() || !Features.contains_insensitive("wavefrontsize")))
-    return false;
-
-  Function *F = CI->getParent()->getParent();
-  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
-  unsigned N = ST.getWavefrontSize();
-
-  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
-               << N << "\n");
-
-  CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
-  CI->eraseFromParent();
-  return true;
-}
-
 // Get insertion point at entry.
 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
   Function * Func = UI->getParent()->getParent();
@@ -1642,8 +1606,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
 }
 
 // Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
-  return new AMDGPUSimplifyLibCalls(TM);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass() {
+  return new AMDGPUSimplifyLibCalls();
 }
 
 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
@@ -1677,7 +1641,7 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
 
 PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                   FunctionAnalysisManager &AM) {
-  AMDGPULibCalls Simplifier(&TM);
+  AMDGPULibCalls Simplifier;
   Simplifier.initNativeFuncs();
   Simplifier.initFunction(F);
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index bad6976f1ff9bc..2be85724a10109 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -631,7 +631,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
       [this](StringRef PassName, FunctionPassManager &PM,
              ArrayRef<PassBuilder::PipelineElement>) {
         if (PassName == "amdgpu-simplifylib") {
-          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+          PM.addPass(AMDGPUSimplifyLibCallsPass());
           return true;
         }
         if (PassName == "amdgpu-usenative") {
@@ -683,11 +683,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   });
 
   PB.registerPipelineStartEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel Level) {
+      [](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
         FPM.addPass(AMDGPUUseNativeCallsPass());
         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
-          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+          FPM.addPass(AMDGPUSimplifyLibCallsPass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       });
 

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 1347ef28a710d5..42fcdad49f7027 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,16 +4,16 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
@@ -22,10 +22,8 @@
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT-W32:   store i32 32, ptr addrspace(1) %arg, align 4
-; OPT-W64:   store i32 64, ptr addrspace(1) %arg, align 4
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   store i32 %tmp, ptr addrspace(1) %arg, align 4
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   store i32 %tmp, ptr addrspace(1) %arg, align 4
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
@@ -43,12 +41,10 @@ bb:
 ; GCN-NOT:   cndmask
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT-W32:   store i32 1, ptr addrspace(1) %arg, align 4
-; OPT-W64:   store i32 2, ptr addrspace(1) %arg, align 4
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT-WXX:   %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT-WXX:   store i32 %tmp2, ptr addrspace(1) %arg
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT:   %tmp2 = select i1 %tmp1, i32 2, i32 1
+; OPT:   store i32 %tmp2, ptr addrspace(1) %arg
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
@@ -64,10 +60,9 @@ bb:
 ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
 
 ; OPT:       bb:
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT-WXX:   bb3:
-; OPT-W64:   store i32 1, ptr addrspace(1) %arg, align 4
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT:   bb3:
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {