[llvm] [AMDGPU] Automatic conversion from wave32 to wave64 (PR #137376)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 8 12:10:50 PDT 2025
https://github.com/alex-t updated https://github.com/llvm/llvm-project/pull/137376
>From 0e647db27581991071d6e69abe977206806c5548 Mon Sep 17 00:00:00 2001
From: alex-t <atimofee at amd.com>
Date: Wed, 23 Apr 2025 22:13:15 +0200
Subject: [PATCH 1/4] [AMDGPU] Automatic conversion from wave32 to wave64
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp | 321 ++++++++++++++++++
llvm/lib/Target/AMDGPU/SIConvertWaveSize.h | 30 ++
.../AMDGPU/wave32-to-64-auto-convert.ll | 121 +++++++
7 files changed, 480 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
create mode 100644 llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..76ef87ba44913 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -51,6 +51,7 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
FunctionPass *createSIFormMemoryClausesLegacyPass();
+FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *);
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
@@ -174,6 +175,9 @@ extern char &SIShrinkInstructionsLegacyID;
void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixSGPRCopiesLegacyID;
+void initializeSIConvertWaveSizeLegacyPass(PassRegistry &);
+extern char &SIConvertWaveSizeLegacyID;
+
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 98a1147ef6d66..0cbd3ef8da761 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b6cc5137d711a..5be1640fd3db6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -44,6 +44,7 @@
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
+#include "SIConvertWaveSize.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
@@ -506,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
+ initializeSIConvertWaveSizeLegacyPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..663361face090 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -150,6 +150,7 @@ add_llvm_target(AMDGPUCodeGen
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
+ SIConvertWaveSize.cpp
SIFoldOperands.cpp
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
new file mode 100644
index 0000000000000..4f5b839000c77
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
@@ -0,0 +1,321 @@
+//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64
+//---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+// Small short living kernels may become waveslot limited.
+// To work around the problem an optimization is proposed to convert such
+// kernels from wave32 to wave64 automatically. These kernels shall conform to a
+// strict set of limitations and satisfy profitability conditions.
+//
+// 1. A kernel shall have no function calls as we cannot analyze call stack
+// requirements (nor will it fall into a category of short living kernels
+// anyway).
+// 2. A kernel itself shall not be called from a device enqueue call.
+// 3. A kernel shall not attempt to access EXEC or VCC in any user visible
+// way.
+// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
+// operations in general.
+// 5. A kernel shall not read wavefront size or use ballot through
+// intrinsics (a use of pre-defined frontend wave size macro was deemed
+// permissible for now).
+// 6. There shall be no atomic operations of any sort as these may be used
+// for cross-thread communication.
+// 7. There shall be no LDS access as the allocation is usually tied to the
+// workgroup size and we generally cannot extend it. It is also changing
+// occupancy which is tied to the wave size.
+// 8. There shall be no inline asm calls.
+// 9. There shall be no dynamic VGPRs.
+// 10. Starting from GFX11 some instructions (such as WMMA on GFX11+ and
+// transpose loads on GFX12+) work differently (have different operands) in
+// wave32 and wave64. The kernel shall not have intrinsics to invoke such
+// instructions.
+
+#include "SIConvertWaveSize.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-convert-wave-size"
+
+namespace {
+class SIConvertWaveSize {
+ const TargetMachine *TM;
+ const LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+
+ InstructionCost TotalCost = 0;
+
+ static const unsigned MaxLatency = 2000;
+
+ SmallVector<Function *> Callees;
+
+public:
+ SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
+ ScalarEvolution *SE, TargetTransformInfo *TTI)
+ : TM(TM), LI(LI), SE(SE), TTI(TTI) {}
+
+ bool run(Function &F);
+
+ bool changeWaveSizeAttr(Function *F);
+};
+
+class SIConvertWaveSizeLegacy : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+ SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+ bool runOnFunction(Function &F) override {
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+ return Impl.run(F);
+ }
+ StringRef getPassName() const override { return "SI convert wave size"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.setPreservesAll();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+void printFunctionAttributes(const Function &F) {
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+ for (const auto &Attr : F.getAttributes()) {
+ LLVM_DEBUG(dbgs() << " Attribute: " << Attr.getAsString() << "\n");
+ }
+}
+
+bool SIConvertWaveSize::run(Function &F) {
+ LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n");
+ LLVM_DEBUG(printFunctionAttributes(F));
+
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
+ return false;
+
+ // Check if the function is a kernel.
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ return false;
+
+ // Check if the kernel is wave32
+ if (F.hasFnAttribute("target-features")) {
+ if (!F.getFnAttribute("target-features")
+ .getValueAsString().contains("wavefrontsize32")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n");
+ return false;
+ }
+ }
+
+ // Check if the function is a device enqueue call.
+ if (F.hasFnAttribute("amdgpu-device-enqueue")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n");
+ return false;
+ }
+
+ // Check if a trip count is a compile time constant for all loops in the
+ // kernel
+ for (Loop *L : *LI) {
+ const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
+ if (!isa<SCEVConstant>(TripCountSCEV)) {
+ LLVM_DEBUG(
+ dbgs() << "SIConvertWaveSize: Trip count is not a compile time "
+ "constant.\n");
+ return false;
+ }
+ }
+
+ for (const auto &BB : F) {
+ InstructionCost BlockCost = 0;
+ for (const auto &I : BB) {
+ if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+ // FIXME: Any calls are not allowed. Only non-converged intrinsic clls
+ // and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd
+ // separately for debug purposes. This will be changed in the final
+ // version.
+ if (CB->isInlineAsm()) {
+ // Inline assembly is not allowed.
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Inline assembly detected.\n");
+ return false;
+ }
+ if (CB->isAtomic()) {
+ // Atomic operations are not allowed.
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Atomic operation detected.\n");
+ return false;
+ }
+ if (Function *Callee = CB->getCalledFunction()) {
+ // assuming readlane/readfirstlane or any cross-lane/DPP
+ // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td
+ if (Callee->isIntrinsic()) {
+ if (Callee->hasFnAttribute(Attribute::Convergent)) {
+ if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
+ // TODO: what else should go in a "white list" ?
+ // Intrinsic::amdgcn_s_barrier_wavefront ?
+ // Intrinsic::amdgcn_s_barrier_signal ?
+ LLVM_DEBUG(dbgs()
+ << "SIConvertWaveSize: Convergent intrinsic "
+ << Callee->getName() << " detected.\n");
+ return false;
+ }
+ }
+
+ if (Callee->getIntrinsicID() == Intrinsic::read_register) {
+ if (const auto *MDVal =
+ dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) {
+ Metadata *MD = MDVal->getMetadata();
+ if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) {
+ if (MDNodeVal->getNumOperands() >= 1) {
+ if (auto *MDStr =
+ dyn_cast<MDString>(MDNodeVal->getOperand(0))) {
+ if (MDStr->getString().starts_with("exec") ||
+ MDStr->getString().starts_with("vcc")) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register("
+ << MDStr->getString()
+ << ") intrinsic detected.\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Save callee as a candidate for attribute change
+ Callees.push_back(Callee);
+ }
+ } else {
+ // General calls are not allowed.
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n");
+ return false;
+ }
+ }
+ // No LDS access is allowed
+ if (auto LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+ return false;
+ }
+ }
+ if (auto SI = dyn_cast<StoreInst>(&I)) {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+ return false;
+ }
+ }
+ // TODO: All atomics are not allowed?
+ // if (auto AI = dyn_cast<AtomicRMWInst>(&I)) {
+ // if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access
+ // detected.\n"); return false;
+ // }
+ // }
+
+ // TODO: Dynamic VGPRS and GFX11+ special operations ???
+ BlockCost +=
+ TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ if (auto L = LI->getLoopFor(&BB)) {
+ const SCEV *TripCount = SE->getBackedgeTakenCount(L);
+ if (auto *C = dyn_cast<SCEVConstant>(TripCount)) {
+ uint64_t TC = C->getValue()->getZExtValue() + 1;
+ size_t Depth = LI->getLoopDepth(&BB);
+ BlockCost *= TC * Depth;
+ } else
+ llvm_unreachable("SIConvertWaveSize: only loops with compile time "
+ "constant trip count could reach here!\n");
+ }
+ TotalCost += BlockCost;
+ if (TotalCost.isValid()) {
+ if (TotalCost.getValue().value() >= MaxLatency) {
+ LLVM_DEBUG(
+ dbgs() << "SIConvertWaveSize: Total latency of the kernel ["
+ << TotalCost.getValue().value()
+ << "] exceeds the limit of 2000 cycles - not profitable!\n");
+ return false;
+ }
+ } else
+ llvm_unreachable(
+ "SIConvertWaveSize: Cost model error - invalid state!\n");
+ }
+
+ // Additional checks can be added here...
+
+ // If all checks pass, convert wave size from wave32 to wave64.
+ // Conversion logic goes here...
+ bool Changed = changeWaveSizeAttr(&F);
+ if (Changed)
+ // Now take care of the intrinsic calls
+ for (auto C : Callees) {
+ // TODO: if we could not change Attr for one of the callee
+ // we need to rollback all the changes!
+ changeWaveSizeAttr(C);
+ }
+
+ return Changed;
+ }
+
+bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
+ auto Attr = F->getFnAttribute("target-features");
+ if (Attr.isValid()) {
+ StringRef AttrStr = Attr.getValueAsString();
+ size_t Pos = AttrStr.find("+wavefrontsize32");
+ if (Pos != StringRef::npos) {
+ // Remove the "+wavefrontsize32" attribute.
+ std::string NewBegin = AttrStr.substr(0, Pos).str().append("+wavefrontsize64");
+ std::string End = AttrStr.substr(Pos + strlen("+wavefrontsize32")).str();
+ std::string NewAttrStr = NewBegin + End;
+ // Add the "+wavefrontsize64" attribute.
+ F->removeFnAttr("target-features");
+ F->addFnAttr("target-features", NewAttrStr);
+ LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for "
+ << F->getName()
+ << " from wave32 "
+ "to wave64.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+ false, false)
+
+char SIConvertWaveSizeLegacy::ID = 0;
+
+char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID;
+
+FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) {
+ return new SIConvertWaveSizeLegacy(TM);
+}
+
+PreservedAnalyses SIConvertWaveSizePass::run(
+ Function &F, FunctionAnalysisManager &FAM) {
+ auto &LI = FAM.getResult<LoopAnalysis>(F);
+ auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+
+ SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+ bool Changed = Impl.run(F);
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
new file mode 100644
index 0000000000000..78b8365ed9ebc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
@@ -0,0 +1,30 @@
+//===- SIConvertWaveSize.h ----------------------------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+#define LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SIConvertWaveSizePass : public PassInfoMixin<SIConvertWaveSizePass> {
+ /// The target machine.
+ const TargetMachine *TM;
+
+public:
+ SIConvertWaveSizePass(const TargetMachine &TM)
+ : TM(&TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
new file mode 100644
index 0000000000000..d90e524e9cc2e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
@@ -0,0 +1,121 @@
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size < %s | FileCheck %s
+
+define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 {
+ ; CHECK: @test_not_wave32{{.*}}) #0
+ %gep = getelementptr i32, ptr addrspace(1) %out, i32 2
+ %tmp = load i32, ptr addrspace(1) %gep
+ store i32 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @intr_non_convergent(ptr addrspace(1) nocapture %arg) #1 {
+ ; CHECK: @intr_non_convergent{{.*}} #0
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+ %tmp1 = icmp ugt i32 %tmp, 32
+ %tmp2 = select i1 %tmp1, i32 2, i32 1
+ store i32 %tmp2, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @intr_convergent(ptr addrspace(1) nocapture %arg, i32 %X) #1 {
+ ; CHECK: @intr_convergent{{.*}}) #1
+bb:
+ %tmp = icmp ugt i32 %X, 32
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %tmp)
+ store i32 %ballot, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @test_barrier(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 {
+ ; CHECK: @test_barrier{{.*}}) #0
+entry:
+ %val = load <2 x half>, ptr addrspace(1) %in
+ call void @llvm.amdgcn.s.barrier() #2
+ store <2 x half> %val, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @test_read_exec(ptr addrspace(1) %out) #1 {
+ ; CHECK: @test_read_exec{{.*}}) #1
+ %exec = call i64 @llvm.read_register.i64(metadata !0)
+ store i64 %exec, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_lo(ptr addrspace(1) %out) #1 {
+ ; CHECK: @test_read_vcc_lo{{.*}}) #1
+ %vcc_lo = call i32 @llvm.read_register.i32(metadata !1)
+ store i32 %vcc_lo, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_hi(ptr addrspace(1) %out) #1 {
+ ; CHECK: @test_read_vcc_hi{{.*}}) #1
+ %vcc_hi = call i32 @llvm.read_register.i32(metadata !2)
+ store i32 %vcc_hi, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 {
+ ; CHECK: @test_lds_access{{.*}}) #1
+ %gep = getelementptr i32, ptr addrspace(3) %out, i32 2
+ %tmp = load i32, ptr addrspace(3) %gep
+ store i32 %tmp, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 {
+ ; CHECK: @test_simple_loop{{.*}}) #1
+bb:
+ br label %bb2
+
+bb1:
+ ret void
+
+bb2:
+ %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
+ %tmp2 = add nuw nsw i32 %tmp1, 1
+ %tmp3 = icmp eq i32 %tmp2, 1024
+ tail call void @llvm.amdgcn.s.sleep(i32 0)
+ br i1 %tmp3, label %bb1, label %bb2
+}
+
+define amdgpu_kernel void @test_nested_loop(ptr addrspace(1) nocapture %arg) #1 {
+ ; CHECK: @test_nested_loop{{.*}}) #1
+bb:
+ br label %bb2
+
+bb1:
+ ret void
+
+bb2:
+ %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
+ %tmp2 = add nuw nsw i32 %tmp1, 1
+ %tmp3 = icmp eq i32 %tmp2, 8
+ br label %bb3
+
+bb3:
+ %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
+ %tmp5 = add nuw nsw i32 %tmp4, 1
+ %tmp6 = icmp eq i32 %tmp5, 128
+ tail call void @llvm.amdgcn.s.sleep(i32 0)
+ br i1 %tmp6, label %bb4, label %bb3
+
+bb4:
+ br i1 %tmp3, label %bb1, label %bb2
+}
+
+declare void @llvm.amdgcn.s.sleep(i32)
+declare i32 @llvm.amdgcn.wavefrontsize()
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.read_register.i32(metadata)
+declare i64 @llvm.read_register.i64(metadata)
+
+attributes #0 = { nounwind "target-features"="+wavefrontsize64" }
+attributes #1 = { nounwind "target-features"="+wavefrontsize32" }
+
+!0 = !{!"exec"}
+!1 = !{!"vcc_lo"}
+!2 = !{!"vcc_hi"}
>From 4bc81331860618d77926a599a5d973eda2ece1a9 Mon Sep 17 00:00:00 2001
From: alex-t <atimofee at amd.com>
Date: Wed, 7 May 2025 21:01:21 +0200
Subject: [PATCH 2/4] [AMDGPU] Automatic conversion from wave32 to wave64.
Review issues addressed.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +-
...WaveSize.cpp => AMDGPUConvertWaveSize.cpp} | 256 ++++++++++++------
...vertWaveSize.h => AMDGPUConvertWaveSize.h} | 13 +-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +-
6 files changed, 183 insertions(+), 98 deletions(-)
rename llvm/lib/Target/AMDGPU/{SIConvertWaveSize.cpp => AMDGPUConvertWaveSize.cpp} (50%)
rename llvm/lib/Target/AMDGPU/{SIConvertWaveSize.h => AMDGPUConvertWaveSize.h} (65%)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 76ef87ba44913..7e585d2698564 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -51,7 +51,9 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
FunctionPass *createSIFormMemoryClausesLegacyPass();
-FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *);
+FunctionPass *createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *);
+void initializeAMDGPUConvertWaveSizeLegacyPass(PassRegistry &);
+extern char &AMDGPUConvertWaveSizeLegacyID;
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
similarity index 50%
rename from llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
rename to llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
index 4f5b839000c77..c166def577558 100644
--- a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
@@ -1,11 +1,12 @@
-//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64
-//---------===//
+//===- AMDGPUConvertWaveSize.cpp ------------------------------------------===//
+//
+// Automatically converts wave32 kernels to wave64
//
// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
//
/// \file
// Small short living kernels may become waveslot limited.
@@ -36,7 +37,7 @@
// wave32 and wave64. The kernel shall not have intrinsics to invoke such
// instructions.
-#include "SIConvertWaveSize.h"
+#include "AMDGPUConvertWaveSize.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -49,8 +50,8 @@ using namespace llvm;
#define DEBUG_TYPE "si-convert-wave-size"
namespace {
-class SIConvertWaveSize {
- const TargetMachine *TM;
+class AMDGPUConvertWaveSize {
+ const GCNTargetMachine *TM;
const LoopInfo *LI;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
@@ -62,8 +63,8 @@ class SIConvertWaveSize {
SmallVector<Function *> Callees;
public:
- SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
- ScalarEvolution *SE, TargetTransformInfo *TTI)
+ AMDGPUConvertWaveSize(const GCNTargetMachine *TM, const LoopInfo *LI,
+ ScalarEvolution *SE, TargetTransformInfo *TTI)
: TM(TM), LI(LI), SE(SE), TTI(TTI) {}
bool run(Function &F);
@@ -71,20 +72,20 @@ class SIConvertWaveSize {
bool changeWaveSizeAttr(Function *F);
};
-class SIConvertWaveSizeLegacy : public FunctionPass {
- const TargetMachine *TM;
+class AMDGPUConvertWaveSizeLegacy : public FunctionPass {
+ const GCNTargetMachine *TM;
public:
static char ID;
- SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+ AMDGPUConvertWaveSizeLegacy(const GCNTargetMachine *TM) : FunctionPass(ID), TM(TM) {}
bool runOnFunction(Function &F) override {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+ AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI);
return Impl.run(F);
}
- StringRef getPassName() const override { return "SI convert wave size"; }
+ StringRef getPassName() const override { return "AMDGPU convert wave size"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
@@ -101,30 +102,44 @@ void printFunctionAttributes(const Function &F) {
}
}
-bool SIConvertWaveSize::run(Function &F) {
- LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n");
- LLVM_DEBUG(printFunctionAttributes(F));
-
- const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
- if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
- return false;
+bool AMDGPUConvertWaveSize::run(Function &F) {
// Check if the function is a kernel.
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return false;
- // Check if the kernel is wave32
- if (F.hasFnAttribute("target-features")) {
- if (!F.getFnAttribute("target-features")
- .getValueAsString().contains("wavefrontsize32")) {
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n");
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ if (!ST.isWave32()) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel is not wave32.\n");
+ return false;
+ }
+
+ for (const auto &Arg : F.args()) {
+ if (Arg.getType()->isPointerTy() &&
+ Arg.getType()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel argument " << Arg
+ << " points to LDS object\n");
return false;
}
}
- // Check if the function is a device enqueue call.
- if (F.hasFnAttribute("amdgpu-device-enqueue")) {
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n");
+ // Check if the function can be called via device enqueue.
+ bool addressEscapes = false;
+ if (!F.use_empty()) {
+ const Module *M = F.getParent();
+ for (const GlobalVariable &GV : M->globals()) {
+ if (GV.hasInitializer()) {
+ if (const Constant *Init = GV.getInitializer()) {
+ if (isa<Function>(Init) && Init == &F) {
+ addressEscapes = true;
+ }
+ }
+ }
+ }
+ }
+
+ if (addressEscapes) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel address is taken.\n");
return false;
}
@@ -134,7 +149,7 @@ bool SIConvertWaveSize::run(Function &F) {
const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
if (!isa<SCEVConstant>(TripCountSCEV)) {
LLVM_DEBUG(
- dbgs() << "SIConvertWaveSize: Trip count is not a compile time "
+ dbgs() << "AMDGPUConvertWaveSize: Trip count is not a compile time "
"constant.\n");
return false;
}
@@ -143,23 +158,25 @@ bool SIConvertWaveSize::run(Function &F) {
for (const auto &BB : F) {
InstructionCost BlockCost = 0;
for (const auto &I : BB) {
+
+ // Atomic operations are not allowed.
+ if (I.isAtomic()) {
+ LLVM_DEBUG(
+ dbgs() << "AMDGPUConvertWaveSize: Atomic operation detected.\n");
+ return false;
+ }
+
if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
- // FIXME: Any calls are not allowed. Only non-converged intrinsic clls
- // and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd
- // separately for debug purposes. This will be changed in the final
- // version.
+ // FIXME: Any calls are not allowed. Only non-converged intrinsic calls
+ // and amdgcn_s_barrier are exempt. InlineAsm is checked separately
+ // for debug purposes. This will be changed in the final version.
if (CB->isInlineAsm()) {
// Inline assembly is not allowed.
LLVM_DEBUG(dbgs()
- << "SIConvertWaveSize: Inline assembly detected.\n");
- return false;
- }
- if (CB->isAtomic()) {
- // Atomic operations are not allowed.
- LLVM_DEBUG(dbgs()
- << "SIConvertWaveSize: Atomic operation detected.\n");
+ << "AMDGPUConvertWaveSize: Inline assembly detected.\n");
return false;
}
+
if (Function *Callee = CB->getCalledFunction()) {
// assuming readlane/readfirstlane or any cross-lane/DPP
// operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td
@@ -170,66 +187,131 @@ bool SIConvertWaveSize::run(Function &F) {
// Intrinsic::amdgcn_s_barrier_wavefront ?
// Intrinsic::amdgcn_s_barrier_signal ?
LLVM_DEBUG(dbgs()
- << "SIConvertWaveSize: Convergent intrinsic "
+ << "AMDGPUConvertWaveSize: Convergent intrinsic "
<< Callee->getName() << " detected.\n");
return false;
}
}
- if (Callee->getIntrinsicID() == Intrinsic::read_register) {
- if (const auto *MDVal =
- dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) {
- Metadata *MD = MDVal->getMetadata();
- if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) {
- if (MDNodeVal->getNumOperands() >= 1) {
- if (auto *MDStr =
- dyn_cast<MDString>(MDNodeVal->getOperand(0))) {
- if (MDStr->getString().starts_with("exec") ||
- MDStr->getString().starts_with("vcc")) {
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register("
- << MDStr->getString()
- << ") intrinsic detected.\n");
- return false;
- }
- }
- }
+ if (Callee->getIntrinsicID() == Intrinsic::read_register ||
+ Callee->getIntrinsicID() == Intrinsic::write_register) {
+
+ LLVM_DEBUG(dbgs()
+ << "AMDGPUConvertWaveSize: read/write_register "
+ "intrinsic detected.\n");
+ return false;
+ }
+
+ // Take care of LDS access
+ if (const auto *MTI = dyn_cast<MemTransferInst>(&I)) {
+ auto DstAS = MTI->getDestAddressSpace();
+ auto SrcAS = MTI->getSourceAddressSpace();
+ if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access "
+ "(llvm.memcpy/memmove) detected.\n");
+ return false;
+ }
+ } else if (const auto *MSI = dyn_cast<MemSetInst>(&I)) {
+ auto DstAS = MSI->getDestAddressSpace();
+ if (DstAS == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access "
+ "(llvm.memset) detected.\n");
+ return false;
+ }
+ } else if (const auto AMCI = dyn_cast<AtomicMemCpyInst>(&I)) {
+ auto DstAS = AMCI->getDestAddressSpace();
+ auto SrcAS = AMCI->getSourceAddressSpace();
+ if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(
+ dbgs()
+ << "AMDGPUConvertWaveSize: LDS access "
+ "(llvm.memcpy.element.unordered.atomic) detected\n");
+ return false;
+ }
+ } else
+ if (const auto AMMI = dyn_cast<AtomicMemMoveInst>(&I)) {
+ auto DstAS = AMMI->getDestAddressSpace();
+ auto SrcAS = AMMI->getSourceAddressSpace();
+ if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(
+ dbgs()
+ << "AMDGPUConvertWaveSize: LDS access "
+ "(llvm.memmove.element.unordered.atomic) detected.\n");
+ return false;
+ }
+ } else if (const auto *AMSI = dyn_cast<AtomicMemSetInst>(&I)) {
+ auto DstAS = AMSI->getDestAddressSpace();
+ if (DstAS == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(
+ dbgs()
+ << "AMDGPUConvertWaveSize: LDS access "
+ "(llvm.memset.element.unordered.atomic) detected.\n");
+ return false;
}
}
- }
// Save callee as a candidate for attribute change
Callees.push_back(Callee);
}
} else {
// General calls are not allowed.
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: function call detected.\n");
return false;
}
}
// No LDS access is allowed
- if (auto LI = dyn_cast<LoadInst>(&I)) {
+
+ // We already ensured we have no LDS pointers passed as arguments.
+ // Now take care of pointers cast to LDS from the flat or global address space.
+
+ // Bail out early, before we come across the LDS address use.
+ if (const auto AC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (AC->getDestTy()->getPointerAddressSpace() ==
+ AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
+ return false;
+ }
+ }
+
+ if (const auto I2P = dyn_cast<IntToPtrInst>(&I)) {
+ if (I2P->getDestTy()->isPointerTy() &&
+ I2P->getDestTy()->getPointerAddressSpace() ==
+ AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
+ return false;
+ }
+ }
+
+ // GEP may refer to the global LDS object
+ if (const auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (GEP->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
+ return false;
+ }
+ }
+
+ // Load/Store/Atomics may directly use global LDS object
+ if (const auto LI = dyn_cast<LoadInst>(&I)) {
if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
return false;
}
}
- if (auto SI = dyn_cast<StoreInst>(&I)) {
+ if (const auto SI = dyn_cast<StoreInst>(&I)) {
if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
return false;
}
}
- // TODO: All atomics are not allowed?
- // if (auto AI = dyn_cast<AtomicRMWInst>(&I)) {
- // if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- // LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access
- // detected.\n"); return false;
- // }
- // }
+
+ if (const auto MemIntr = dyn_cast<MemIntrinsic>(&I))
// TODO: Dynamic VGPRS and GFX11+ special operations ???
BlockCost +=
- TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+ TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
}
if (auto L = LI->getLoopFor(&BB)) {
const SCEV *TripCount = SE->getBackedgeTakenCount(L);
@@ -238,21 +320,21 @@ bool SIConvertWaveSize::run(Function &F) {
size_t Depth = LI->getLoopDepth(&BB);
BlockCost *= TC * Depth;
} else
- llvm_unreachable("SIConvertWaveSize: only loops with compile time "
+ llvm_unreachable("AMDGPUConvertWaveSize: only loops with compile time "
"constant trip count could reach here!\n");
}
TotalCost += BlockCost;
if (TotalCost.isValid()) {
if (TotalCost.getValue().value() >= MaxLatency) {
LLVM_DEBUG(
- dbgs() << "SIConvertWaveSize: Total latency of the kernel ["
+ dbgs() << "AMDGPUConvertWaveSize: Total latency of the kernel ["
<< TotalCost.getValue().value()
<< "] exceeds the limit of 2000 cycles - not profitable!\n");
return false;
}
} else
llvm_unreachable(
- "SIConvertWaveSize: Cost model error - invalid state!\n");
+ "AMDGPUConvertWaveSize: Cost model error - invalid state!\n");
}
// Additional checks can be added here...
@@ -269,9 +351,9 @@ bool SIConvertWaveSize::run(Function &F) {
}
return Changed;
- }
+}
-bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
+bool AMDGPUConvertWaveSize::changeWaveSizeAttr(Function *F) {
auto Attr = F->getFnAttribute("target-features");
if (Attr.isValid()) {
StringRef AttrStr = Attr.getValueAsString();
@@ -284,7 +366,7 @@ bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
// Add the "+wavefrontsize64" attribute.
F->removeFnAttr("target-features");
F->addFnAttr("target-features", NewAttrStr);
- LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for "
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
<< F->getName()
<< " from wave32 "
"to wave64.\n");
@@ -294,28 +376,28 @@ bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
return false;
}
-INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+INITIALIZE_PASS_BEGIN(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size",
false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+INITIALIZE_PASS_END(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size",
false, false)
-char SIConvertWaveSizeLegacy::ID = 0;
+char AMDGPUConvertWaveSizeLegacy::ID = 0;
-char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID;
+char &llvm::AMDGPUConvertWaveSizeLegacyID = AMDGPUConvertWaveSizeLegacy::ID;
-FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) {
- return new SIConvertWaveSizeLegacy(TM);
+FunctionPass *llvm::createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *TM) {
+ return new AMDGPUConvertWaveSizeLegacy(TM);
}
-PreservedAnalyses SIConvertWaveSizePass::run(
+PreservedAnalyses AMDGPUConvertWaveSizePass::run(
Function &F, FunctionAnalysisManager &FAM) {
- auto &LI = FAM.getResult<LoopAnalysis>(F);
+ auto &LI = FAM.getResult<LoopAnalysis>(F);
auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
- SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+ AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI);
bool Changed = Impl.run(F);
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h
similarity index 65%
rename from llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
rename to llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h
index 78b8365ed9ebc..e5b8c92c0b656 100644
--- a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h
@@ -5,9 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
-#define LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -15,16 +16,16 @@
namespace llvm {
-class SIConvertWaveSizePass : public PassInfoMixin<SIConvertWaveSizePass> {
+class AMDGPUConvertWaveSizePass : public PassInfoMixin<AMDGPUConvertWaveSizePass> {
/// The target machine.
- const TargetMachine *TM;
+ const GCNTargetMachine *TM;
public:
- SIConvertWaveSizePass(const TargetMachine &TM)
+ AMDGPUConvertWaveSizePass(const GCNTargetMachine &TM)
: TM(&TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
};
} // namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0cbd3ef8da761..b953ba8e77599 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,7 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
-FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-convert-wave-size", AMDGPUConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5be1640fd3db6..f2e7adebf2786 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -44,7 +44,7 @@
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
-#include "SIConvertWaveSize.h"
+#include "AMDGPUConvertWaveSize.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
@@ -507,7 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
- initializeSIConvertWaveSizeLegacyPass(*PR);
+ initializeAMDGPUConvertWaveSizeLegacyPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 663361face090..fccdd47151593 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -150,7 +150,7 @@ add_llvm_target(AMDGPUCodeGen
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
- SIConvertWaveSize.cpp
+ AMDGPUConvertWaveSize.cpp
SIFoldOperands.cpp
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
>From beebaa2c99f088f50b7c5129997e46daf028ae3c Mon Sep 17 00:00:00 2001
From: alex-t <atimofee at amd.com>
Date: Wed, 7 May 2025 21:26:29 +0200
Subject: [PATCH 3/4] [AMDGPU] Automatic conversion from wave32 to wave64.
Initialize method name fixed
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 7e585d2698564..06aa8c4ad06b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -177,8 +177,8 @@ extern char &SIShrinkInstructionsLegacyID;
void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixSGPRCopiesLegacyID;
-void initializeSIConvertWaveSizeLegacyPass(PassRegistry &);
-extern char &SIConvertWaveSizeLegacyID;
+void initializeAMDGPUConvertWaveSizeLegacyPass(PassRegistry &);
+extern char &AMDGPUConvertWaveSizeLegacyID;
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
>From 04a4147bbb97166c805f48c5bf2faac7648def3f Mon Sep 17 00:00:00 2001
From: alex-t <atimofee at amd.com>
Date: Thu, 8 May 2025 22:05:39 +0200
Subject: [PATCH 4/4] [AMDGPU] Automatic conversion from wave32 to wave64. --
Simplified wavefrontsize attribute update -- LDS access checks changed --
new LDS test in LIT NOTE: because of the simplified attribute update, we need
to change all the checks. Most likely, checking the attributes directly is now
impossible.
---
.../Target/AMDGPU/AMDGPUConvertWaveSize.cpp | 159 ++++--------------
.../AMDGPU/wave32-to-64-auto-convert.ll | 29 +++-
2 files changed, 62 insertions(+), 126 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
index c166def577558..1ee86c437610d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp
@@ -47,7 +47,7 @@
using namespace llvm;
-#define DEBUG_TYPE "si-convert-wave-size"
+#define DEBUG_TYPE "amdgpu-convert-wave-size"
namespace {
class AMDGPUConvertWaveSize {
@@ -68,8 +68,6 @@ class AMDGPUConvertWaveSize {
: TM(TM), LI(LI), SE(SE), TTI(TTI) {}
bool run(Function &F);
-
- bool changeWaveSizeAttr(Function *F);
};
class AMDGPUConvertWaveSizeLegacy : public FunctionPass {
@@ -123,22 +121,25 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
}
}
- // Check if the function can be called via device enqueue.
- bool addressEscapes = false;
- if (!F.use_empty()) {
- const Module *M = F.getParent();
- for (const GlobalVariable &GV : M->globals()) {
- if (GV.hasInitializer()) {
- if (const Constant *Init = GV.getInitializer()) {
- if (isa<Function>(Init) && Init == &F) {
- addressEscapes = true;
- }
+ // Check for static LDS uses
+ const Module *M = F.getParent();
+ for (const GlobalVariable &GV : M->globals()) {
+ if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+
+ for (auto User : GV.users()) {
+ if (auto UseI = dyn_cast<Instruction>(User)) {
+ if (UseI->getFunction() == &F) {
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Global variable " << GV
+ << " points to LDS object and is used\n");
+ return false;
}
}
}
}
- if (addressEscapes) {
+ // Check if the kernel can be called via device enqueue.
+ if (F.hasAddressTaken()) {
LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel address is taken.\n");
return false;
}
@@ -184,8 +185,6 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
if (Callee->hasFnAttribute(Attribute::Convergent)) {
if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
// TODO: what else should go in a "white list" ?
- // Intrinsic::amdgcn_s_barrier_wavefront ?
- // Intrinsic::amdgcn_s_barrier_signal ?
LLVM_DEBUG(dbgs()
<< "AMDGPUConvertWaveSize: Convergent intrinsic "
<< Callee->getName() << " detected.\n");
@@ -202,57 +201,6 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
return false;
}
- // Take care of LDS access
- if (const auto *MTI = dyn_cast<MemTransferInst>(&I)) {
- auto DstAS = MTI->getDestAddressSpace();
- auto SrcAS = MTI->getSourceAddressSpace();
- if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access "
- "(llvm.memcpy/memmove) detected.\n");
- return false;
- }
- } else if (const auto *MSI = dyn_cast<MemSetInst>(&I)) {
- auto DstAS = MSI->getDestAddressSpace();
- if (DstAS == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access "
- "(llvm.memset) detected.\n");
- return false;
- }
- } else if (const auto AMCI = dyn_cast<AtomicMemCpyInst>(&I)) {
- auto DstAS = AMCI->getDestAddressSpace();
- auto SrcAS = AMCI->getSourceAddressSpace();
- if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(
- dbgs()
- << "AMDGPUConvertWaveSize: LDS access "
- "(llvm.memcpy.element.unordered.atomic) detected\n");
- return false;
- }
- } else
- if (const auto AMMI = dyn_cast<AtomicMemMoveInst>(&I)) {
- auto DstAS = AMMI->getDestAddressSpace();
- auto SrcAS = AMMI->getSourceAddressSpace();
- if (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(
- dbgs()
- << "AMDGPUConvertWaveSize: LDS access "
- "(llvm.memmove.element.unordered.atomic) detected.\n");
- return false;
- }
- } else if (const auto *AMSI = dyn_cast<AtomicMemSetInst>(&I)) {
- auto DstAS = AMSI->getDestAddressSpace();
- if (DstAS == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(
- dbgs()
- << "AMDGPUConvertWaveSize: LDS access "
- "(llvm.memset.element.unordered.atomic) detected.\n");
- return false;
- }
- }
-
// Save callee as a candidate for attribute change
Callees.push_back(Callee);
}
@@ -271,7 +219,9 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
if (const auto AC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (AC->getDestTy()->getPointerAddressSpace() ==
AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "AMDGPUConvertWaveSize: addrspacecast to LDS detected.\n");
return false;
}
}
@@ -280,36 +230,14 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
if (I2P->getDestTy()->isPointerTy() &&
I2P->getDestTy()->getPointerAddressSpace() ==
AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: conversion of int to LDS "
+ "pointer detected.\n");
return false;
}
}
- // GEP may refer to the global LDS object
- if (const auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (GEP->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
- return false;
- }
- }
-
- // Load/Store/Atomics may directly use global LDS object
- if (const auto LI = dyn_cast<LoadInst>(&I)) {
- if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
- return false;
- }
- }
- if (const auto SI = dyn_cast<StoreInst>(&I)) {
- if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n");
- return false;
- }
- }
-
- if (const auto MemIntr = dyn_cast<MemIntrinsic>(&I))
-
// TODO: Dynamic VGPRS and GFX11+ special operations ???
+
BlockCost +=
TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
}
@@ -340,41 +268,22 @@ bool AMDGPUConvertWaveSize::run(Function &F) {
// Additional checks can be added here...
// If all checks pass, convert wave size from wave32 to wave64.
- // Conversion logic goes here...
- bool Changed = changeWaveSizeAttr(&F);
- if (Changed)
- // Now take care of the intrinsic calls
- for (auto C : Callees) {
- // TODO: if we could not change Attr for one of the callee
- // we need to rollback all the changes!
- changeWaveSizeAttr(C);
- }
+ F.addFnAttr("target-features", "+wavefrontsize64");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
+ << F.getName() << " from wave32 to wave64.\n");
+ // Now take care of the intrinsic calls
+ for (auto C : Callees) {
+ C->addFnAttr("target-features", "+wavefrontsize64");
+ LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
+ << C->getName() << " from wave32 to wave64.\n");
+ }
- return Changed;
+ return true;
}
-bool AMDGPUConvertWaveSize::changeWaveSizeAttr(Function *F) {
- auto Attr = F->getFnAttribute("target-features");
- if (Attr.isValid()) {
- StringRef AttrStr = Attr.getValueAsString();
- size_t Pos = AttrStr.find("+wavefrontsize32");
- if (Pos != StringRef::npos) {
- // Remove the "+wavefrontsize32" attribute.
- std::string NewBegin = AttrStr.substr(0, Pos).str().append("+wavefrontsize64");
- std::string End = AttrStr.substr(Pos + strlen("+wavefrontsize32")).str();
- std::string NewAttrStr = NewBegin + End;
- // Add the "+wavefrontsize64" attribute.
- F->removeFnAttr("target-features");
- F->addFnAttr("target-features", NewAttrStr);
- LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for "
- << F->getName()
- << " from wave32 "
- "to wave64.\n");
- return true;
- }
- }
- return false;
-}
+//===----------------------------------------------------------------------===//
+// Pass registration
+//===----------------------------------------------------------------------===//
INITIALIZE_PASS_BEGIN(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size",
false, false)
diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
index d90e524e9cc2e..f43dc3235a05d 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-convert-wave-size < %s | FileCheck %s
define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 {
; CHECK: @test_not_wave32{{.*}}) #0
@@ -66,6 +66,33 @@ define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 {
ret void
}
+define amdgpu_kernel void @test_addrspacecast_to_lds(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %in, i32 16
+ %ptr = addrspacecast ptr addrspace(1) %gep to ptr addrspace(3)
+ %val = load i32, ptr addrspace(3) %ptr
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_bitcast_to_lds_ptr(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %in, i32 16
+ %lds = inttoptr i32 0 to ptr addrspace(3)
+ %val = load i32, ptr addrspace(3) %lds
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+ at lds = addrspace(3) global [256 x i32] zeroinitializer
+
+define amdgpu_kernel void @test_use_global_lds_object(ptr addrspace(1) %out, i1 %p) #0 {
+ %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10
+ %ld = load i32, ptr addrspace(3) %gep
+ store i32 %ld, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 {
; CHECK: @test_simple_loop{{.*}}) #1
bb:
More information about the llvm-commits
mailing list