[llvm] d797a7f - [AMDGPU] Use performOptimizedStructLayout for LDS sort
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 22 09:58:24 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-06-22T09:58:10-07:00
New Revision: d797a7f8da18e39d8ce1913175601faf2002a005
URL: https://github.com/llvm/llvm-project/commit/d797a7f8da18e39d8ce1913175601faf2002a005
DIFF: https://github.com/llvm/llvm-project/commit/d797a7f8da18e39d8ce1913175601faf2002a005.diff
LOG: [AMDGPU] Use performOptimizedStructLayout for LDS sort
This gives better packing.
Differential Revision: https://reviews.llvm.org/D104331
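For context: the patch replaces a hand-written stable sort (alignment descending, then size, then name) with the generic performOptimizedStructLayout helper from llvm/Support/OptimizedStructLayout.h, which can tuck small low-alignment fields into alignment gaps instead of only appending fields in sorted order. A minimal sketch of the usage pattern, with dummy (size, alignment) pairs standing in for the pass's GlobalVariables:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  using namespace llvm;

  // Lay out three dummy fields. Id is an opaque tag the caller can
  // round-trip later (the pass stores the GlobalVariable* there).
  std::pair<uint64_t, Align> layoutExample() {
    SmallVector<OptimizedStructLayoutField, 8> Fields;
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/4, Align(4));
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/1, Align(1));
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/8, Align(8));
    // Reorders Fields in place, assigns each field's Offset, and returns
    // the struct's total size and required alignment.
    return performOptimizedStructLayout(Fields);
  }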
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 48e3ad68a4f4..30acbc9110d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -46,8 +46,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
#include <vector>
#define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -210,35 +210,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
}
- // Sort by alignment, descending, to minimise padding.
- // On ties, sort by size, descending, then by name, lexicographical.
- llvm::stable_sort(
- FoundLocalVars,
- [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
- Align ALHS = AMDGPU::getAlign(DL, LHS);
- Align ARHS = AMDGPU::getAlign(DL, RHS);
- if (ALHS != ARHS) {
- return ALHS > ARHS;
- }
-
- TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
- TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
- if (SLHS != SRHS) {
- return SLHS > SRHS;
- }
-
- // By variable name on tie for predictable order in test cases.
- return LHS->getName() < RHS->getName();
- });
+ SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+ LayoutFields.reserve(FoundLocalVars.size());
+ for (GlobalVariable *GV : FoundLocalVars) {
+ OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
+ AMDGPU::getAlign(DL, GV));
+ LayoutFields.emplace_back(F);
+ }
+
+ performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
{
// This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
- for (size_t I = 0; I < FoundLocalVars.size(); I++) {
- GlobalVariable *FGV = FoundLocalVars[I];
- Align DataAlign = AMDGPU::getAlign(DL, FGV);
+ for (size_t I = 0; I < LayoutFields.size(); I++) {
+ GlobalVariable *FGV = static_cast<GlobalVariable *>(
+ const_cast<void *>(LayoutFields[I].Id));
+ Align DataAlign = LayoutFields[I].Alignment;
uint64_t DataAlignV = DataAlign.value();
if (uint64_t Rem = CurrentOffset % DataAlignV) {
@@ -257,7 +247,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
LocalVars.push_back(FGV);
- CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+ CurrentOffset += LayoutFields[I].Size;
}
}
@@ -272,14 +262,14 @@ class AMDGPULowerModuleLDS : public ModulePass {
: "llvm.amdgcn.module.lds");
StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
- Align MaxAlign =
- AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
+ Align StructAlign =
+ AMDGPU::getAlign(DL, LocalVars[0]);
GlobalVariable *SGV = new GlobalVariable(
M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false);
- SGV->setAlignment(MaxAlign);
+ SGV->setAlignment(StructAlign);
if (!F) {
appendToCompilerUsed(
M, {static_cast<GlobalValue *>(
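One detail worth noting in the hunk above: OptimizedStructLayoutField carries an opaque const void *Id, which is why the loop casts it back to GlobalVariable*. The helper also assigns each field's Offset and returns the fields sorted by ascending offset, so the padding could in principle be read back from those offsets rather than recomputed incrementally. A hypothetical sketch of that alternative (not what the patch does):

  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  using namespace llvm;

  // Fields arrive sorted by ascending Offset, so the gap before each
  // field is exactly the padding the struct needs at that point.
  void walkLayout(ArrayRef<OptimizedStructLayoutField> LayoutFields) {
    uint64_t CurrentOffset = 0;
    for (const OptimizedStructLayoutField &F : LayoutFields) {
      auto *GV = static_cast<GlobalVariable *>(const_cast<void *>(F.Id));
      uint64_t Padding = F.Offset - CurrentOffset; // may be zero
      // A real implementation would append a [Padding x i8] array and
      // then GV's value type to the struct body here.
      (void)GV;
      (void)Padding;
      CurrentOffset = F.Offset + F.Size;
    }
  }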
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
index 948c07dcc22c..f7070a6adcf9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -20,7 +20,7 @@
; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4
; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define i32 @get_func() local_unnamed_addr #0 {
entry:
%0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -28,7 +28,7 @@ entry:
}
; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define void @set_func(i32 %x) local_unnamed_addr #1 {
entry:
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -40,14 +40,14 @@ entry:
; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3
+; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
; CHECK: %5 = inttoptr i64 %4 to i32*
; CHECK: %ld = load i32, i32* %5, align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)
+; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
; CHECK: %10 = inttoptr i64 %9 to i32*
; CHECK: store i32 %mul, i32* %10, align 4
define amdgpu_kernel void @timestwo() {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
index 0ce2aba7ffde..104c87774a72 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -16,7 +16,7 @@
; But none of them are used anywhere. Hence, @lds.6 is not lowered.
;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8] }
; CHECK-NOT: @lds.1
; CHECK-NOT: @lds.2
@@ -41,7 +41,7 @@
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
-; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 4)
+; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2)
; CHECK: @alias.to.lds.2 = alias [2 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3)
; CHECK: @alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3
; CHECK: @alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
index 7985ecff4044..77fcefa7944d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -17,7 +17,7 @@
; and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [1 x float], [2 x float] }
; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
@@ -36,7 +36,7 @@
; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
-; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
index 167834409a0d..6c5669ed2a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -5,10 +5,10 @@
; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
; Different properly aligned values, but same size of 1.
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8], [3 x i8], [1 x i8] }
; All are under-aligned, requires to fix each on different alignment boundary.
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [1 x i8], [2 x i8], [3 x i8], [1 x i8], [5 x i8] }
; All LDS are underaligned, requires to allocate on 8 byte boundary
; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
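The k1 change above illustrates the packing win. Reading the old padding back, the five 1-byte variables appear to have alignments 16/8/4/2/1 (an inference, not taken from the test source): the old descending-alignment sort needed 7+3+1 bytes of padding for a 16-byte struct, while the optimized layout fills the gaps with the low-alignment variables and gets the struct down to 9 bytes. A standalone sketch that feeds the same fields to the helper and prints the offsets it assigns:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // Five 1-byte fields; alignments inferred from k1's old padding.
    SmallVector<OptimizedStructLayoutField, 8> Fields;
    const char *Names[] = {"a16", "a8", "a4", "a2", "a1"};
    uint64_t Aligns[] = {16, 8, 4, 2, 1};
    for (int I = 0; I != 5; ++I)
      Fields.emplace_back(/*Id=*/Names[I], /*Size=*/1, Align(Aligns[I]));

    std::pair<uint64_t, Align> SA = performOptimizedStructLayout(Fields);
    for (const OptimizedStructLayoutField &F : Fields)
      outs() << static_cast<const char *>(F.Id) << " -> offset "
             << F.Offset << "\n";
    outs() << "size " << SA.first << ", align " << SA.second.value() << "\n";
  }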