[clang] [llvm] [llvm][AMDGPU] Fold `llvm.amdgcn.wavefrontsize` early (PR #114481)
Alex Voicu via cfe-commits
cfe-commits at lists.llvm.org
Sun Nov 24 14:07:28 PST 2024
https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/114481
>From 3ba88ce598aaab269169f0a5db5981c9a9ac8603 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 31 Oct 2024 22:38:36 +0000
Subject: [PATCH 01/11] Add pass to handle AMDGCN pseudo-intrinsics (abstract
placeholders for target specific info), and add handling for
`llvm.amdgcn.wavefrontsize`.
---
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 5 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 ++
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 49 +++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 99 ++++++++++++++-----
7 files changed, 139 insertions(+), 29 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf5f2971cf118c..de6a06dad6a08d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK-LABEL test_wavefrontsize(
unsigned test_wavefrontsize() {
- // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+ // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+ // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
return __builtin_amdgcn_wavefrontsize();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..17d3e6ab7c65ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
extern char &AMDGPUResourceUsageAnalysisID;
+struct AMDGPUExpandPseudoIntrinsicsPass
+ : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
new file mode 100644
index 00000000000000..faa23bb8550dbc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -0,0 +1,49 @@
+//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to a HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
+ Function *WaveSize) {
+ if (WaveSize->hasZeroLiveUses())
+ return PreservedAnalyses::all();
+
+ for (auto &&U : WaveSize->users())
+ U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
+ ST.getWavefrontSize()));
+
+ return PreservedAnalyses::none();
+}
+
+PreservedAnalyses
+ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+
+ if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
+ return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..323c195c329168 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
+ AMDGPUExpandPseudoIntrinsicsPass(*this))
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d93ec34a703d3d..2bf8df6588c59c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineStartEPCallback(
- [](ModulePassManager &PM, OptimizationLevel Level) {
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
FunctionPassManager FPM;
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (EnableHipStdPar)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..c9d4452b4a035c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUExpandPseudoIntrinsics.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027db..efa53def5ee686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
@@ -5,28 +6,43 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,20 +50,35 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT: store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
@@ -57,15 +88,31 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT: bb:
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: bb3:
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
>From 826c291f59f05cb7065dceb6052f3d8b7bf33f57 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 01:01:19 +0000
Subject: [PATCH 02/11] Implement review feedback.
---
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index faa23bb8550dbc..b46097bbd33e99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
@@ -33,17 +34,22 @@ static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
return PreservedAnalyses::all();
for (auto &&U : WaveSize->users())
- U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
- ST.getWavefrontSize()));
+ U->replaceAllUsesWith(
+ ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
return PreservedAnalyses::none();
}
PreservedAnalyses
- AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
- if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
- return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+ if (auto WS =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+ return expandWaveSizeIntrinsic(ST, WS);
return PreservedAnalyses::all();
}
>From ab6f5a22a2442468f2ef0a7f18239f858b6320b7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:02:08 +0000
Subject: [PATCH 03/11] Do not fold early for `generic` mcpu.
---
llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++++
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 8 ++++----
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index b46097bbd33e99..fb2ef7b7ed2d71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -47,6 +47,10 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
+ // This is not a concrete target, we should not fold early.
+ if (ST.getCPU().empty() || ST.getCPU() == "generic")
+ return PreservedAnalyses::all();
+
if (auto WS =
Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
return expandWaveSizeIntrinsic(ST, WS);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index efa53def5ee686..2d060fd4305077 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -6,10 +6,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
>From f8705fbe9f9c78148ca0a0360caf2650ab546185 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:07:03 +0000
Subject: [PATCH 04/11] Fix formatting (again).
---
llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index fb2ef7b7ed2d71..bf0ec39ab6c6e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -51,8 +51,8 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
if (ST.getCPU().empty() || ST.getCPU() == "generic")
return PreservedAnalyses::all();
- if (auto WS =
- Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+ if (auto WS = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::amdgcn_wavefrontsize))
return expandWaveSizeIntrinsic(ST, WS);
return PreservedAnalyses::all();
>From 026ed0092adf5c8a8b08b1772338c08ed501b54a Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:27:45 +0200
Subject: [PATCH 05/11] Remove pass, fold in InstCombine.
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 +++++++++
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +----
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 6 +++---
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..d952103aa81fdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
break;
}
+ case Intrinsic::amdgcn_wavefrontsize: {
+ // TODO: this is a workaround for the pseudo-generic target one gets with no
+ // specified mcpu, which spoofs its wave size to 64; it should be removed.
+ if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+ !ST->getFeatureString().contains("+wavefrontsize"))
+ break;
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(),
+ ST->getWavefrontSize()));
+ }
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 851e0b25ad1625..86d8dbe4d803cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -744,10 +744,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineStartEPCallback(
- [this](ModulePassManager &PM, OptimizationLevel Level) {
- PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
- FunctionPassManager FPM;
- PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ [](ModulePassManager &PM, OptimizationLevel Level) {
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 2d060fd4305077..f1aed3dc00c100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -7,9 +7,9 @@
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
>From 195decc90bbdc1996d04bdf0ef4fe18f0d1953c2 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:53:46 +0200
Subject: [PATCH 06/11] Remove leftovers.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 ---------
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 --
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 -
3 files changed, 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 17d3e6ab7c65ab..95d0ad0f9dc96a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID;
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
extern char &AMDGPUResourceUsageAnalysisID;
-struct AMDGPUExpandPseudoIntrinsicsPass
- : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
- const AMDGPUTargetMachine &TM;
- AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
- static bool isRequired() { return true; }
-};
-
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 323c195c329168..174a90f0aa419d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint",
*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
- AMDGPUExpandPseudoIntrinsicsPass(*this))
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c9d4452b4a035c..fed29c3e14aae2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
- AMDGPUExpandPseudoIntrinsics.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
>From 1a7abaffc499ff8d54bc7b1fd76ca2fdf78b92a0 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:54:21 +0200
Subject: [PATCH 07/11] Remove pass.
---
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 59 -------------------
1 file changed, 59 deletions(-)
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index bf0ec39ab6c6e7..00000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
- Function *WaveSize) {
- if (WaveSize->hasZeroLiveUses())
- return PreservedAnalyses::all();
-
- for (auto &&U : WaveSize->users())
- U->replaceAllUsesWith(
- ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
-
- return PreservedAnalyses::none();
-}
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
- if (M.empty())
- return PreservedAnalyses::all();
-
- const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
-
- // This is not a concrete target, we should not fold early.
- if (ST.getCPU().empty() || ST.getCPU() == "generic")
- return PreservedAnalyses::all();
-
- if (auto WS = Intrinsic::getDeclarationIfExists(
- &M, Intrinsic::amdgcn_wavefrontsize))
- return expandWaveSizeIntrinsic(ST, WS);
-
- return PreservedAnalyses::all();
-}
>From 9aed76ceb02fd2a1b1edf68e65f9bdac6de0509e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:59:28 +0200
Subject: [PATCH 08/11] Fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index d952103aa81fdb..ae5b1292921d1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1030,7 +1030,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
!ST->getFeatureString().contains("+wavefrontsize"))
break;
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(),
+ return IC.replaceInstUsesWith(
+ II, ConstantInt::get(II.getType(),
ST->getWavefrontSize()));
}
case Intrinsic::amdgcn_wqm_vote: {
>From 246c22fb2afc9ad600d897771fce8a2dc28b7ed1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 20:38:03 +0200
Subject: [PATCH 09/11] Really fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ae5b1292921d1e..0b2548af72fc0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1031,8 +1031,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
!ST->getFeatureString().contains("+wavefrontsize"))
break;
return IC.replaceInstUsesWith(
- II, ConstantInt::get(II.getType(),
- ST->getWavefrontSize()));
+ II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
}
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
>From 7cf75589441350d6207717ab936ae61582adbc73 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 6 Nov 2024 20:55:15 +0200
Subject: [PATCH 10/11] Split tests.
---
.../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 76 +-----------
.../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 114 ++++++++++++++++++
2 files changed, 115 insertions(+), 75 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index f1aed3dc00c100..33dd2bd540ad06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,21 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-
; GCN-LABEL: {{^}}fold_wavefrontsize:
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
@@ -24,25 +12,7 @@
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-NEXT: [[BB:.*:]]
-; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
-; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT: ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W64-NEXT: [[BB:.*:]]
-; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT: ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W32-NEXT: [[BB:.*:]]
-; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT: ret void
-;
+
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -58,27 +28,6 @@ bb:
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT: [[BB:.*:]]
-; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
-; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT: ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT: [[BB:.*:]]
-; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT: ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W32-NEXT: [[BB:.*:]]
-; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT: ret void
-;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
@@ -90,29 +39,6 @@ bb:
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT: [[BB:.*:]]
-; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
-; OPT: [[BB2]]:
-; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT: br label %[[BB3]]
-; OPT: [[BB3]]:
-; OPT-NEXT: ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT: [[BB:.*:]]
-; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT: ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; OPT-W32-NEXT: [[BB:.*:]]
-; OPT-W32-NEXT: ret void
-;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 00000000000000..d9c105f753e264
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ store i32 %tmp, ptr addrspace(1) %arg, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ %tmp1 = icmp ugt i32 %tmp, 32
+ %tmp2 = select i1 %tmp1, i32 2, i32 1
+ store i32 %tmp2, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32: [[BB2]]:
+; OPT-W32-NEXT: br label %[[BB3]]
+; OPT-W32: [[BB3]]:
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64: [[BB2]]:
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: br label %[[BB3]]
+; OPT-W64: [[BB3]]:
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ %tmp1 = icmp ugt i32 %tmp, 32
+ br i1 %tmp1, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+ store i32 1, ptr addrspace(1) %arg, align 4
+ br label %bb3
+
+bb3: ; preds = %bb2, %bb
+ ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }
>From ed9f19f9154ae9868a12a78b9740523f727dc98c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 18 Nov 2024 16:54:00 +0000
Subject: [PATCH 11/11] Tweak `generic` mcpu handling.
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 0b2548af72fc0d..688519e1fb6700 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1027,7 +1027,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
case Intrinsic::amdgcn_wavefrontsize: {
// TODO: this is a workaround for the pseudo-generic target one gets with no
// specified mcpu, which spoofs its wave size to 64; it should be removed.
- if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+ if ((ST->getCPU().empty() || ST->getCPU().starts_with("generic")) &&
!ST->getFeatureString().contains("+wavefrontsize"))
break;
return IC.replaceInstUsesWith(
More information about the cfe-commits
mailing list