[clang] [llvm] [opt][AMDGPU] Add pass to handle AMDGCN pseudo-intrinsics target specific info), start with `llvm.amdgcn.wavefrontsize`. (PR #114481)

Thu Oct 31 15:52:00 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Alex Voicu (AlexVlx)

<details>
<summary>Changes</summary>

Pseudo-intrinsics are a mechanism for conveying that some target specific constant info / quantity exists, without inducing AST mutation based on said quantity. They enable making e.g. control flow decisions in a HLL that end up with a constant evaluated predicate in the ME and optimised accordingly, without making the AST (even more) target specific. `llvm.amdgcn.wavefrontsize` is an example of such a pseudo-intrinsic, but we will add more in the near future.

This change adds an `opt` pass that will handle pseudo-intrinsics as early as possible (roughly immediately after Clang CodeGen), so that all subsequent processing over IR is done with target specific info in place. For the time being it handles only `llvm.amdgcn.wavefrontsize`, replacing all calls to it with to the actual hardware quantity for concrete targets.

---
Full diff: https://github.com/llvm/llvm-project/pull/114481.diff


7 Files Affected:

- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+3-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+9) 
- (added) llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp (+49) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+2-1) 
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll (+73-26) 


``````````diff

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf5f2971cf118c..de6a06dad6a08d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
 
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
 // CHECK-LABEL test_wavefrontsize(
 unsigned test_wavefrontsize() {
 
-  // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+  // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+  // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
   return __builtin_amdgcn_wavefrontsize();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..17d3e6ab7c65ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandPseudoIntrinsicsPass
+    : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
new file mode 100644
index 00000000000000..faa23bb8550dbc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -0,0 +1,49 @@
+//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to a HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
+                                                        Function *WaveSize) {
+  if (WaveSize->hasZeroLiveUses())
+    return PreservedAnalyses::all();
+
+  for (auto &&U : WaveSize->users())
+    U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
+                                           ST.getWavefrontSize()));
+
+  return PreservedAnalyses::none();
+}
+
+PreservedAnalyses
+  AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+
+  if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
+    return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..323c195c329168 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
+            AMDGPUExpandPseudoIntrinsicsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d93ec34a703d3d..2bf8df6588c59c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #include "llvm/Passes/TargetPassRegistry.inc"
 
   PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
         FunctionPassManager FPM;
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         if (EnableHipStdPar)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..c9d4452b4a035c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027db..efa53def5ee686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
@@ -5,28 +6,43 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
 ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
 
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,20 +50,35 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT:   cndmask
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT:   store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
@@ -57,15 +88,31 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT:       bb:
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   bb3:
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32

``````````

</details>


https://github.com/llvm/llvm-project/pull/114481