[llvm] e377dc4 - [AMDGPU] Max. WG size-induced occupancy limits max. waves/EU (#137807)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 04:22:26 PDT 2025
Author: Lucas Ramirez
Date: 2025-05-01T13:22:23+02:00
New Revision: e377dc4d38b69050a3301c68637d1b6dacaee3a9
URL: https://github.com/llvm/llvm-project/commit/e377dc4d38b69050a3301c68637d1b6dacaee3a9
DIFF: https://github.com/llvm/llvm-project/commit/e377dc4d38b69050a3301c68637d1b6dacaee3a9.diff
LOG: [AMDGPU] Max. WG size-induced occupancy limits max. waves/EU (#137807)
The default maximum waves/EU returned by the family of
`AMDGPUSubtarget::getWavesPerEU` is currently the maximum number of
waves/EU supported by the subtarget (only a valid occupancy range in
"amdgpu-waves-per-eu" may lower that maximum). This ignores maximum
achievable occupancy imposed by flat workgroup size and LDS usage,
resulting in situations where `AMDGPUSubtarget::getWavesPerEU` produces
a maximum higher than the one from
`AMDGPUSubtarget::getOccupancyWithWorkGroupSizes`.
This patch limits the maximum of the waves/EU range to the maximum achievable
occupancy derived from flat workgroup sizes and LDS usage. This only affects
functions that restrict their flat workgroup size with
"amdgpu-flat-work-group-size", since the default range of flat workgroup sizes
already achieves the maximum number of waves/EU supported by the subtarget.
Improvements to the handling of "amdgpu-waves-per-eu" are left for a
follow-up PR (e.g., I think the attribute should be able to lower the
full range of waves/EU produced by these methods).
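To illustrate with a minimal, hypothetical kernel (not part of this patch; the
resulting waves/EU bounds depend on the subtarget's wavefront size, SIMDs per
CU, and workgroup limits):

; Hypothetical example kernel, for illustration only.
define amdgpu_kernel void @small_wg() #0 {
entry:
  ret void
}
attributes #0 = { "amdgpu-flat-work-group-size"="64,64" }

Previously `AMDGPUSubtarget::getWavesPerEU` still reported the subtarget-wide
maximum for such a function, even though
`AMDGPUSubtarget::getOccupancyWithWorkGroupSizes` reports a lower achievable
occupancy for 64-lane workgroups; with this change the reported maximum is
clamped to that occupancy (also accounting for any "amdgpu-lds-size"). The
updated register-budget checks in attr-amdgpu-flat-work-group-size.ll below
follow directly from this clamping.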
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
llvm/test/CodeGen/AMDGPU/load-global-i16.ll
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 87fa845f3cff7..b9ce8dc0c5cdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -209,7 +209,7 @@ class AMDGPUInformationCache : public InformationCache {
getWavesPerEU(const Function &F,
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- return ST.getWavesPerEU(F, FlatWorkGroupSize);
+ return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
}
std::optional<std::pair<unsigned, unsigned>>
@@ -230,7 +230,8 @@ class AMDGPUInformationCache : public InformationCache {
std::pair<unsigned, unsigned> WavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
+ return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
+ getLDSSize(F));
}
unsigned getMaxWavesPerEU(const Function &F) {
@@ -255,6 +256,14 @@ class AMDGPUInformationCache : public InformationCache {
return Status;
}
+ /// Returns the minimum amount of LDS space used by a workgroup running
+ /// function \p F.
+ static unsigned getLDSSize(const Function &F) {
+ return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
+ {0, UINT32_MAX}, true)
+ .first;
+ }
+
/// Get the constant access bitmap for \p C.
uint8_t getConstantAccess(const Constant *C,
SmallPtrSetImpl<const Constant *> &Visited) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 6c01f6dd370f1..933ee6ceeaf4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -195,12 +195,14 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
}
};
-unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
+static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
+ const Function &F) {
if (!TM.getTargetTriple().isAMDGCN())
return 128;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(
+ ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
// A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function
@@ -336,10 +338,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
if (!ST.isPromoteAllocaEnabled())
return false;
- MaxVGPRs = getMaxVGPRs(TM, F);
- setFunctionLimits(F);
-
bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
+ MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
+ setFunctionLimits(F);
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
@@ -1452,29 +1453,14 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
unsigned MaxOccupancy =
- ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
-
- // Restrict local memory usage so that we don't drastically reduce occupancy,
- // unless it is already significantly reduced.
-
- // TODO: Have some sort of hint or other heuristics to guess occupancy based
- // on other factors..
- unsigned OccupancyHint = ST.getWavesPerEU(F).second;
- if (OccupancyHint == 0)
- OccupancyHint = 7;
-
- // Clamp to max value.
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
- // Check the hint but ignore it if it's obviously wrong from the existing LDS
- // usage.
- MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+ ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
+ .second;
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount =
ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
- // Program is possibly broken by using more local mem than available.
+ // Program may already use more LDS than is usable at maximum occupancy.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4373528d6d517..563605f964cc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -55,9 +55,9 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}
-std::pair<unsigned, unsigned>
-AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
- const Function &F) const {
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
+ uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
+
// FIXME: We should take into account the LDS allocation granularity.
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
@@ -81,7 +81,7 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
// workgroups, maximum number of waves, and minimum occupancy. The opposite is
// generally true for the minimum group size. LDS or barrier ressource
// limitations can flip those minimums/maximums.
- const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
+ const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
@@ -180,45 +180,52 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
- std::pair<unsigned, unsigned> Requested,
- std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
- // Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
-
- // If minimum/maximum flat work group sizes were explicitly requested using
- // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
- // number of waves per execution unit to values implied by requested
- // minimum/maximum flat work group sizes.
- unsigned MinImpliedByFlatWorkGroupSize =
- getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
- Default.first = MinImpliedByFlatWorkGroupSize;
+ std::pair<unsigned, unsigned> RequestedWavesPerEU,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
+ // Default minimum/maximum number of waves per EU. The range of flat workgroup
+ // sizes limits the achievable maximum, and we aim to support enough waves per
+ // EU so that we can concurrently execute all waves of a single workgroup of
+ // maximum size on a CU.
+ std::pair<unsigned, unsigned> Default = {
+ getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
+ getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
+ Default.first = std::min(Default.first, Default.second);
// Make sure requested minimum is less than requested maximum.
- if (Requested.second && Requested.first > Requested.second)
+ if (RequestedWavesPerEU.second &&
+ RequestedWavesPerEU.first > RequestedWavesPerEU.second)
return Default;
- // Make sure requested values do not violate subtarget's specifications.
- if (Requested.first < getMinWavesPerEU() ||
- Requested.second > getMaxWavesPerEU())
+ // Make sure requested values do not violate subtarget's specifications and
+ // are compatible with values implied by minimum/maximum flat workgroup sizes.
+ if (RequestedWavesPerEU.first < Default.first ||
+ RequestedWavesPerEU.second > Default.second)
return Default;
- // Make sure requested values are compatible with values implied by requested
- // minimum/maximum flat work group sizes.
- if (Requested.first < MinImpliedByFlatWorkGroupSize)
- return Default;
+ return RequestedWavesPerEU;
+}
- return Requested;
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
+ // Default/requested minimum/maximum flat work group sizes.
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
+ // Minimum number of bytes allocated in the LDS.
+ unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
+ {0, UINT32_MAX}, true)
+ .first;
+ return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
- const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
+ unsigned LDSBytes, const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
- return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
+ return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index a71731ecf8a3f..91fe2a69bc0b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -106,21 +106,24 @@ class AMDGPUSubtarget {
/// be converted to integer, violate subtarget's specifications, or are not
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
- std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
- // Default/requested minimum/maximum flat work group sizes.
- std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
- return getWavesPerEU(F, FlatWorkGroupSizes);
- }
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
- /// Overload which uses the specified values for the flat work group sizes,
- /// rather than querying the function itself. \p FlatWorkGroupSizes Should
- /// correspond to the function's value for getFlatWorkGroupSizes.
+ /// Overload which uses the specified values for the flat workgroup sizes and
+ /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
+ /// should correspond to the function's value for getFlatWorkGroupSizes and \p
+ /// LDSBytes to the per-workgroup LDS allocation.
std::pair<unsigned, unsigned>
- getWavesPerEU(const Function &F,
- std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
- std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
- std::pair<unsigned, unsigned> WavesPerEU,
- std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
+ getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
+ unsigned LDSBytes, const Function &F) const;
+
+ /// Returns the target minimum/maximum number of waves per EU. This is based
+ /// on the minimum/maximum number of \p RequestedWavesPerEU and further
+ /// limited by the maximum achievable occupancy derived from the range of \p
+ /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+ std::pair<unsigned, unsigned>
+ getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes,
+ unsigned LDSBytes) const;
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
@@ -133,7 +136,16 @@ class AMDGPUSubtarget {
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
- getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
+ getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
+ return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
+ }
+
+ /// Overload which uses the specified values for the flat work group sizes,
+ /// rather than querying the function itself. \p FlatWorkGroupSizes should
+ /// correspond to the function's value for getFlatWorkGroupSizes.
+ std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
+ uint32_t LDSBytes,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 22cc5af30da66..616867481d177 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -24,10 +24,10 @@ entry:
attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
; CHECK-LABEL: {{^}}min_128_max_128:
-; CHECK: SGPRBlocks: 0
-; CHECK: VGPRBlocks: 0
-; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: SGPRBlocks: 8
+; CHECK: VGPRBlocks: 7
+; CHECK: NumSGPRsForWavesPerEU: 65
+; CHECK: NumVGPRsForWavesPerEU: 29
define amdgpu_kernel void @min_128_max_128() #2 {
entry:
ret void
@@ -35,9 +35,9 @@ entry:
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 8
; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 65
; CHECK: NumVGPRsForWavesPerEU: 43
@var = addrspace(1) global float 0.0
define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 9054e509cde8e..b19486b0e7671 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -6581,50 +6581,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
index 1e5d6755fbc85..bd1258cb1cf98 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -42,4 +42,4 @@ bb2:
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
-attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #1 = { "amdgpu-num-vgpr"="9" }