[clang] [llvm] [opt][AMDGPU] Add pass to handle AMDGCN pseudo-intrinsics, start with `llvm.amdgcn.wavefrontsize` (PR #114481)
Alex Voicu via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 4 09:28:04 PST 2024
https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/114481
>From 3ba88ce598aaab269169f0a5db5981c9a9ac8603 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 31 Oct 2024 22:38:36 +0000
Subject: [PATCH 1/5] Add pass to handle AMDGCN pseudo-intrinsics (abstract
placeholders for target specific info), and add handling for
`llvm.amdgcn.wavefrontsize`.
---
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 5 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 ++
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 49 +++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 99 ++++++++++++++-----
7 files changed, 139 insertions(+), 29 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf5f2971cf118c..de6a06dad6a08d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK-LABEL test_wavefrontsize(
unsigned test_wavefrontsize() {
- // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+ // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+ // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
return __builtin_amdgcn_wavefrontsize();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..17d3e6ab7c65ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
extern char &AMDGPUResourceUsageAnalysisID;
+struct AMDGPUExpandPseudoIntrinsicsPass
+ : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
new file mode 100644
index 00000000000000..faa23bb8550dbc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -0,0 +1,49 @@
+//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to a HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
+ Function *WaveSize) {
+ if (WaveSize->hasZeroLiveUses())
+ return PreservedAnalyses::all();
+
+ for (auto &&U : WaveSize->users())
+ U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
+ ST.getWavefrontSize()));
+
+ return PreservedAnalyses::none();
+}
+
+PreservedAnalyses
+ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+
+ if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
+ return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..323c195c329168 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
+ AMDGPUExpandPseudoIntrinsicsPass(*this))
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d93ec34a703d3d..2bf8df6588c59c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineStartEPCallback(
- [](ModulePassManager &PM, OptimizationLevel Level) {
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
FunctionPassManager FPM;
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (EnableHipStdPar)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..c9d4452b4a035c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUExpandPseudoIntrinsics.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027db..efa53def5ee686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
@@ -5,28 +6,43 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,20 +50,35 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT: store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
@@ -57,15 +88,31 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT: bb:
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: bb3:
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
>From 826c291f59f05cb7065dceb6052f3d8b7bf33f57 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 01:01:19 +0000
Subject: [PATCH 2/5] Implement review feedback.
---
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index faa23bb8550dbc..b46097bbd33e99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
@@ -33,17 +34,22 @@ static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
return PreservedAnalyses::all();
for (auto &&U : WaveSize->users())
- U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
- ST.getWavefrontSize()));
+ U->replaceAllUsesWith(
+ ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
return PreservedAnalyses::none();
}
PreservedAnalyses
- AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
- if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
- return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+ if (auto WS =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+ return expandWaveSizeIntrinsic(ST, WS);
return PreservedAnalyses::all();
}
>From ab6f5a22a2442468f2ef0a7f18239f858b6320b7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:02:08 +0000
Subject: [PATCH 3/5] Do not fold early for `generic` mcpu.
---
llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++++
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 8 ++++----
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index b46097bbd33e99..fb2ef7b7ed2d71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -47,6 +47,10 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
+ // This is not a concrete target, we should not fold early.
+ if (ST.getCPU().empty() || ST.getCPU() == "generic")
+ return PreservedAnalyses::all();
+
if (auto WS =
Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
return expandWaveSizeIntrinsic(ST, WS);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index efa53def5ee686..2d060fd4305077 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -6,10 +6,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
>From f8705fbe9f9c78148ca0a0360caf2650ab546185 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:07:03 +0000
Subject: [PATCH 4/5] Fix formatting (again).
---
llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index fb2ef7b7ed2d71..bf0ec39ab6c6e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -51,8 +51,8 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
if (ST.getCPU().empty() || ST.getCPU() == "generic")
return PreservedAnalyses::all();
- if (auto WS =
- Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+ if (auto WS = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::amdgcn_wavefrontsize))
return expandWaveSizeIntrinsic(ST, WS);
return PreservedAnalyses::all();
>From 026ed0092adf5c8a8b08b1772338c08ed501b54a Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:27:45 +0200
Subject: [PATCH 5/5] Remove pass, fold in InstCombine.
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 +++++++++
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +----
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 6 +++---
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..d952103aa81fdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
break;
}
+ case Intrinsic::amdgcn_wavefrontsize: {
+ // TODO: this is a workaround for the pseudo-generic target one gets with no
+ // specified mcpu, which spoofs its wave size to 64; it should be removed.
+ if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+ !ST->getFeatureString().contains("+wavefrontsize"))
+ break;
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(),
+ ST->getWavefrontSize()));
+ }
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 851e0b25ad1625..86d8dbe4d803cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -744,10 +744,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineStartEPCallback(
- [this](ModulePassManager &PM, OptimizationLevel Level) {
- PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
- FunctionPassManager FPM;
- PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ [](ModulePassManager &PM, OptimizationLevel Level) {
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 2d060fd4305077..f1aed3dc00c100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -7,9 +7,9 @@
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
More information about the cfe-commits
mailing list