[clang] 48ec59c - [llvm][AMDGPU] Fold `llvm.amdgcn.wavefrontsize` early (#114481)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 25 02:29:54 PST 2024
Author: Alex Voicu
Date: 2024-11-25T10:29:50Z
New Revision: 48ec59c234ce267a0454b15e9a79a326e21a4a97
URL: https://github.com/llvm/llvm-project/commit/48ec59c234ce267a0454b15e9a79a326e21a4a97
DIFF: https://github.com/llvm/llvm-project/commit/48ec59c234ce267a0454b15e9a79a326e21a4a97.diff
LOG: [llvm][AMDGPU] Fold `llvm.amdgcn.wavefrontsize` early (#114481)
Fold `llvm.amdgcn.wavefrontsize` early, during InstCombine, so that it's
concrete value is used throughout subsequent optimisation passes.
Added:
llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
Modified:
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
Removed:
################################################################################
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 3bc6107b7fd40d..c22a43146a8c89 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK-LABEL test_wavefrontsize(
unsigned test_wavefrontsize() {
- // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+ // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+ // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
return __builtin_amdgcn_wavefrontsize();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 087de1bed86f76..18a09c39a06387 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
break;
}
+ case Intrinsic::amdgcn_wavefrontsize: {
+ if (ST->isWaveSizeKnown())
+ return IC.replaceInstUsesWith(
+ II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
+ break;
+ }
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027db..33dd2bd540ad06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,29 +4,15 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-
; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,18 +20,12 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT: store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
@@ -57,13 +37,6 @@ bb:
}
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT: bb:
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: bb3:
-; OPT-NEXT: ret void
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 00000000000000..d9c105f753e264
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ store i32 %tmp, ptr addrspace(1) %arg, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ %tmp1 = icmp ugt i32 %tmp, 32
+ %tmp2 = select i1 %tmp1, i32 2, i32 1
+ store i32 %tmp2, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32: [[BB2]]:
+; OPT-W32-NEXT: br label %[[BB3]]
+; OPT-W32: [[BB3]]:
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64: [[BB2]]:
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: br label %[[BB3]]
+; OPT-W64: [[BB3]]:
+; OPT-W64-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+ %tmp1 = icmp ugt i32 %tmp, 32
+ br i1 %tmp1, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+ store i32 1, ptr addrspace(1) %arg, align 4
+ br label %bb3
+
+bb3: ; preds = %bb2, %bb
+ ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }
More information about the cfe-commits
mailing list