[clang] [llvm] [llvm][AMDGPU] Fold `llvm.amdgcn.wavefrontsize` early (PR #114481)

Sun Nov 24 14:07:28 PST 2024

https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/114481

>From 3ba88ce598aaab269169f0a5db5981c9a9ac8603 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 31 Oct 2024 22:38:36 +0000
Subject: [PATCH 01/11] Add pass to handle AMDGCN pseudo-intrinsics (abstract
 placeholders for target specific info), and add handling for
 `llvm.amdgcn.wavefrontsize`.

---
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  5 +-
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  9 ++
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 49 +++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  2 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  3 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  1 +
 .../AMDGPU/llvm.amdgcn.wavefrontsize.ll       | 99 ++++++++++++++-----
 7 files changed, 139 insertions(+), 29 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf5f2971cf118c..de6a06dad6a08d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
 
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
 // CHECK-LABEL test_wavefrontsize(
 unsigned test_wavefrontsize() {
 
-  // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+  // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+  // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
   return __builtin_amdgcn_wavefrontsize();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..17d3e6ab7c65ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandPseudoIntrinsicsPass
+    : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
new file mode 100644
index 00000000000000..faa23bb8550dbc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -0,0 +1,49 @@
+//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to a HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
+                                                        Function *WaveSize) {
+  if (WaveSize->hasZeroLiveUses())
+    return PreservedAnalyses::all();
+
+  for (auto &&U : WaveSize->users())
+    U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
+                                           ST.getWavefrontSize()));
+
+  return PreservedAnalyses::none();
+}
+
+PreservedAnalyses
+  AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+
+  if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
+    return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..323c195c329168 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
+            AMDGPUExpandPseudoIntrinsicsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d93ec34a703d3d..2bf8df6588c59c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #include "llvm/Passes/TargetPassRegistry.inc"
 
   PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
         FunctionPassManager FPM;
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         if (EnableHipStdPar)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..c9d4452b4a035c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027db..efa53def5ee686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
@@ -5,28 +6,43 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
 ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
 
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,20 +50,35 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT:   cndmask
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT:   store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
@@ -57,15 +88,31 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT:       bb:
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   bb3:
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32

>From 826c291f59f05cb7065dceb6052f3d8b7bf33f57 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 01:01:19 +0000
Subject: [PATCH 02/11] Implement review feedback.

---
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp      | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index faa23bb8550dbc..b46097bbd33e99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -22,6 +22,7 @@
 
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 
@@ -33,17 +34,22 @@ static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
     return PreservedAnalyses::all();
 
   for (auto &&U : WaveSize->users())
-    U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(),
-                                           ST.getWavefrontSize()));
+    U->replaceAllUsesWith(
+        ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
 
   return PreservedAnalyses::none();
 }
 
 PreservedAnalyses
-  AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
 
-  if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
-    return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
+  if (auto WS =
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+    return expandWaveSizeIntrinsic(ST, WS);
 
   return PreservedAnalyses::all();
 }

>From ab6f5a22a2442468f2ef0a7f18239f858b6320b7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:02:08 +0000
Subject: [PATCH 03/11] Do not fold early for `generic` mcpu.

---
 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll   | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index b46097bbd33e99..fb2ef7b7ed2d71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -47,6 +47,10 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
 
   const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
 
+  // This is not a concrete target, we should not fold early.
+  if (ST.getCPU().empty() || ST.getCPU() == "generic")
+    return PreservedAnalyses::all();
+
   if (auto WS =
       Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
     return expandWaveSizeIntrinsic(ST, WS);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index efa53def5ee686..2d060fd4305077 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -6,10 +6,10 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
 ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s

>From f8705fbe9f9c78148ca0a0360caf2650ab546185 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 1 Nov 2024 02:07:03 +0000
Subject: [PATCH 04/11] Fix formatting (again).

---
 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
index fb2ef7b7ed2d71..bf0ec39ab6c6e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -51,8 +51,8 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
   if (ST.getCPU().empty() || ST.getCPU() == "generic")
     return PreservedAnalyses::all();
 
-  if (auto WS =
-      Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize))
+  if (auto WS = Intrinsic::getDeclarationIfExists(
+          &M, Intrinsic::amdgcn_wavefrontsize))
     return expandWaveSizeIntrinsic(ST, WS);
 
   return PreservedAnalyses::all();

>From 026ed0092adf5c8a8b08b1772338c08ed501b54a Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:27:45 +0200
Subject: [PATCH 05/11] Remove pass, fold in InstCombine.

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 +++++++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp        | 5 +----
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 6 +++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..d952103aa81fdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
+  case Intrinsic::amdgcn_wavefrontsize: {
+    // TODO: this is a workaround for the pseudo-generic target one gets with no
+    // specified mcpu, which spoofs its wave size to 64; it should be removed.
+    if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+        !ST->getFeatureString().contains("+wavefrontsize"))
+      break;
+    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(),
+                                                       ST->getWavefrontSize()));
+  }
   case Intrinsic::amdgcn_wqm_vote: {
     // wqm_vote is identity when the argument is constant.
     if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 851e0b25ad1625..86d8dbe4d803cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -744,10 +744,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #include "llvm/Passes/TargetPassRegistry.inc"
 
   PB.registerPipelineStartEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel Level) {
-        PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
-        FunctionPassManager FPM;
-        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+      [](ModulePassManager &PM, OptimizationLevel Level) {
         if (EnableHipStdPar)
           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
       });
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 2d060fd4305077..f1aed3dc00c100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -7,9 +7,9 @@
 
 ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s

>From 195decc90bbdc1996d04bdf0ef4fe18f0d1953c2 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:53:46 +0200
Subject: [PATCH 06/11] Remove leftovers.

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               | 9 ---------
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 --
 llvm/lib/Target/AMDGPU/CMakeLists.txt         | 1 -
 3 files changed, 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 17d3e6ab7c65ab..95d0ad0f9dc96a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
-struct AMDGPUExpandPseudoIntrinsicsPass
-    : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
-  const AMDGPUTargetMachine &TM;
-  AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-  static bool isRequired() { return true; }
-};
-
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 323c195c329168..174a90f0aa419d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
-            AMDGPUExpandPseudoIntrinsicsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c9d4452b4a035c..fed29c3e14aae2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
-  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp

>From 1a7abaffc499ff8d54bc7b1fd76ca2fdf78b92a0 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:54:21 +0200
Subject: [PATCH 07/11] Remove pass.

---
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 59 -------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index bf0ec39ab6c6e7..00000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
-                                                        Function *WaveSize) {
-  if (WaveSize->hasZeroLiveUses())
-    return PreservedAnalyses::all();
-
-  for (auto &&U : WaveSize->users())
-    U->replaceAllUsesWith(
-        ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
-
-  return PreservedAnalyses::none();
-}
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
-  if (M.empty())
-    return PreservedAnalyses::all();
-
-  const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
-
-  // This is not a concrete target, we should not fold early.
-  if (ST.getCPU().empty() || ST.getCPU() == "generic")
-    return PreservedAnalyses::all();
-
-  if (auto WS = Intrinsic::getDeclarationIfExists(
-          &M, Intrinsic::amdgcn_wavefrontsize))
-    return expandWaveSizeIntrinsic(ST, WS);
-
-  return PreservedAnalyses::all();
-}

>From 9aed76ceb02fd2a1b1edf68e65f9bdac6de0509e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 19:59:28 +0200
Subject: [PATCH 08/11] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index d952103aa81fdb..ae5b1292921d1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1030,7 +1030,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
         !ST->getFeatureString().contains("+wavefrontsize"))
       break;
-    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(),
+    return IC.replaceInstUsesWith(
+        II, ConstantInt::get(II.getType(),
                                                        ST->getWavefrontSize()));
   }
   case Intrinsic::amdgcn_wqm_vote: {

>From 246c22fb2afc9ad600d897771fce8a2dc28b7ed1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 4 Nov 2024 20:38:03 +0200
Subject: [PATCH 09/11] Really fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ae5b1292921d1e..0b2548af72fc0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1031,8 +1031,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         !ST->getFeatureString().contains("+wavefrontsize"))
       break;
     return IC.replaceInstUsesWith(
-        II, ConstantInt::get(II.getType(),
-                                                       ST->getWavefrontSize()));
+        II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
   }
   case Intrinsic::amdgcn_wqm_vote: {
     // wqm_vote is identity when the argument is constant.

>From 7cf75589441350d6207717ab936ae61582adbc73 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 6 Nov 2024 20:55:15 +0200
Subject: [PATCH 10/11] Split tests.

---
 .../AMDGPU/llvm.amdgcn.wavefrontsize.ll       |  76 +-----------
 .../AMDGPU/llvm.amdgcn.wavefrontsize.ll       | 114 ++++++++++++++++++
 2 files changed, 115 insertions(+), 75 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index f1aed3dc00c100..33dd2bd540ad06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,21 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
@@ -24,25 +12,7 @@
 
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
-; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -58,27 +28,6 @@ bb:
 
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
-; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
@@ -90,29 +39,6 @@ bb:
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
-; OPT:       [[BB2]]:
-; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    br label %[[BB3]]
-; OPT:       [[BB3]]:
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 00000000000000..d9c105f753e264
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32:       [[BB2]]:
+; OPT-W32-NEXT:    br label %[[BB3]]
+; OPT-W32:       [[BB3]]:
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64:       [[BB2]]:
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    br label %[[BB3]]
+; OPT-W64:       [[BB3]]:
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }

>From ed9f19f9154ae9868a12a78b9740523f727dc98c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 18 Nov 2024 16:54:00 +0000
Subject: [PATCH 11/11] Tweak `generic` mcpu handling.

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 0b2548af72fc0d..688519e1fb6700 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1027,7 +1027,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   case Intrinsic::amdgcn_wavefrontsize: {
     // TODO: this is a workaround for the pseudo-generic target one gets with no
     // specified mcpu, which spoofs its wave size to 64; it should be removed.
-    if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+    if ((ST->getCPU().empty() || ST->getCPU().starts_with("generic")) &&
         !ST->getFeatureString().contains("+wavefrontsize"))
       break;
     return IC.replaceInstUsesWith(