[llvm] 8d4b74a - AMDGPU: Don't consider whether amdgpu-flat-work-group-size was set
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 22 13:23:55 PDT 2021
Author: Matt Arsenault
Date: 2021-10-22T16:23:50-04:00
New Revision: 8d4b74ac3f1fc90bef05ffb3ed7c0f7903f8ffe3
URL: https://github.com/llvm/llvm-project/commit/8d4b74ac3f1fc90bef05ffb3ed7c0f7903f8ffe3
DIFF: https://github.com/llvm/llvm-project/commit/8d4b74ac3f1fc90bef05ffb3ed7c0f7903f8ffe3.diff
LOG: AMDGPU: Don't consider whether amdgpu-flat-work-group-size was set
It should be semantically identical if it was set to the same value as
the default. Also improve the documentation.
Added:
llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
llvm/test/CodeGen/AMDGPU/target-cpu.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5d197d8b4373..3fa80e56f288 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -856,6 +856,8 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that
will be specified when the kernel is dispatched. Generated
by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
+ The implied default value is 1,1024.
+
"amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel
argument block size for the implicit arguments. This
varies by OS and language (for OpenCL see
@@ -866,7 +868,12 @@ The AMDGPU backend supports the following LLVM IR attributes.
``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per
execution unit. Generated by the ``amdgpu_waves_per_eu``
- CLANG attribute [CLANG-ATTR]_.
+ CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
+ and the backend may not be able to satisfy the request. If
+ the specified range is incompatible with the function's
+ "amdgpu-flat-work-group-size" value, the implied occupancy
+ bounds by the workgroup size takes precedence.
+
"amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the
mode register to be set on entry. Overrides the default for
the calling convention.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1d3b12380bb9..d19431a59dbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -544,8 +544,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
unsigned MinImpliedByFlatWorkGroupSize =
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
Default.first = MinImpliedByFlatWorkGroupSize;
- bool RequestedFlatWorkGroupSize =
- F.hasFnAttribute("amdgpu-flat-work-group-size");
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -562,8 +560,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values are compatible with values implied by requested
// minimum/maximum flat work group sizes.
- if (RequestedFlatWorkGroupSize &&
- Requested.first < MinImpliedByFlatWorkGroupSize)
+ if (Requested.first < MinImpliedByFlatWorkGroupSize)
return Default;
return Requested;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c55fff82f117..ecff0fb2b1ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
-; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
@@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:272
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:276
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:280
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:284
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296
@@ -218,38 +230,40 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508
-; GCN-NEXT: s_waitcnt vmcnt(60)
-; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
-; GCN-NEXT: s_waitcnt vmcnt(57)
-; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
-; GCN-NEXT: s_waitcnt vmcnt(54)
-; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
-; GCN-NEXT: s_waitcnt vmcnt(51)
-; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
-; GCN-NEXT: s_waitcnt vmcnt(48)
-; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
-; GCN-NEXT: s_waitcnt vmcnt(45)
-; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
-; GCN-NEXT: s_waitcnt vmcnt(42)
-; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
-; GCN-NEXT: s_waitcnt vmcnt(39)
-; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
-; GCN-NEXT: s_waitcnt vmcnt(36)
-; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
-; GCN-NEXT: s_waitcnt vmcnt(33)
-; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
-; GCN-NEXT: s_waitcnt vmcnt(30)
-; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
-; GCN-NEXT: s_waitcnt vmcnt(27)
-; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
-; GCN-NEXT: s_waitcnt vmcnt(24)
-; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
-; GCN-NEXT: s_waitcnt vmcnt(21)
-; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
-; GCN-NEXT: s_waitcnt vmcnt(18)
-; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21]
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] offset:16
+; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[20:21] offset:32
+; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[20:21] offset:48
+; GCN-NEXT: global_store_dwordx4 v0, v[16:19], s[20:21] offset:64
+; GCN-NEXT: global_store_dwordx4 v0, v[20:23], s[20:21] offset:80
+; GCN-NEXT: global_store_dwordx4 v0, v[24:27], s[20:21] offset:96
+; GCN-NEXT: global_store_dwordx4 v0, v[28:31], s[20:21] offset:112
+; GCN-NEXT: global_store_dwordx4 v0, v[32:35], s[20:21] offset:128
+; GCN-NEXT: global_store_dwordx4 v0, v[36:39], s[20:21] offset:144
+; GCN-NEXT: global_store_dwordx4 v0, v[40:43], s[20:21] offset:160
+; GCN-NEXT: global_store_dwordx4 v0, v[44:47], s[20:21] offset:176
+; GCN-NEXT: global_store_dwordx4 v0, v[48:51], s[20:21] offset:192
+; GCN-NEXT: global_store_dwordx4 v0, v[52:55], s[20:21] offset:208
+; GCN-NEXT: global_store_dwordx4 v0, v[56:59], s[20:21] offset:224
+; GCN-NEXT: global_store_dwordx4 v0, v[60:63], s[20:21] offset:240
; GCN-NEXT: s_endpgm
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
@@ -257,4 +271,4 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
ret void
}
-attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
+attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index e2e5e3369bd9..3d4acd79132d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -108,5 +108,5 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
attributes #1 = { nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index a5e97205de21..1edec164ef26 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
-attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
+attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }
; Exactly 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_5:
@@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
-attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
+attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}
; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
new file mode 100644
index 000000000000..997cbc2ea461
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
+
+; Both of these kernels have the same value for
+; amdgpu-flat-work-group-size, except one explicitly sets it. This is
+; a program visible property which should always take precedence over
+; the amdgpu-waves-per-eu optimization hint.
+;
+; The range is incompatible with the amdgpu-waves-per-eu value, so the
+; flat work group size should take precedence implying a requirement
+; to support 1024 size workgroups (which exceeds the available LDS
+; amount).
+
+; CHECK-NOT: @no_flat_workgroup_size.stack
+; CHECK-NOT: @explicit_default_workgroup_size.stack
+
+; CHECK-LABEL: @no_flat_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4, addrspace(5)
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+ store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+ store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+ %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+ %3 = load i32, i32 addrspace(5)* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK-LABEL: @explicit_default_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+ %stack = alloca [5 x i32], align 4, addrspace(5)
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+ store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+ store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+ %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+ %3 = load i32, i32 addrspace(5)* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 25e0376dd7ee..a503a79827fe 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
ret void
}
-attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
+attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
index a313664cb0e5..437e3a78ad59 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -585,5 +585,5 @@ bb:
; Function Attrs: nounwind readnone
declare float @llvm.fmuladd.f32(float, float, float) #1
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index bc26b3fa19a6..8ad70cf95c09 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -588,4 +588,4 @@ bb:
declare float @llvm.fmuladd.f32(float, float, float) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
index 4750cf2020d9..9a56e85deceb 100644
--- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
@@ -107,5 +107,5 @@ attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-cpu"="tahiti" }
attributes #3 = { nounwind "target-cpu"="bonaire" }
attributes #4 = { nounwind "target-cpu"="fiji" }
-attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
-attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }
More information about the llvm-commits
mailing list