[llvm] d797a7f - [AMDGPU] Use performOptimizedStructLayout for LDS sort
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 22 09:58:24 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-06-22T09:58:10-07:00
New Revision: d797a7f8da18e39d8ce1913175601faf2002a005
URL: https://github.com/llvm/llvm-project/commit/d797a7f8da18e39d8ce1913175601faf2002a005
DIFF: https://github.com/llvm/llvm-project/commit/d797a7f8da18e39d8ce1913175601faf2002a005.diff
LOG: [AMDGPU] Use performOptimizedStructLayout for LDS sort
This gives better packing.
Differential Revision: https://reviews.llvm.org/D104331
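For context: the patch replaces a hand-written stable sort (alignment descending, then size, then name) with the generic performOptimizedStructLayout helper from llvm/Support/OptimizedStructLayout.h, which can tuck small low-alignment fields into alignment gaps instead of only appending fields in sorted order. A minimal sketch of the usage pattern, with dummy (size, alignment) pairs standing in for the pass's GlobalVariables:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  using namespace llvm;

  // Lay out three dummy fields. Id is an opaque tag the caller can
  // round-trip later (the pass stores the GlobalVariable* there).
  std::pair<uint64_t, Align> layoutExample() {
    SmallVector<OptimizedStructLayoutField, 8> Fields;
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/4, Align(4));
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/1, Align(1));
    Fields.emplace_back(/*Id=*/nullptr, /*Size=*/8, Align(8));
    // Reorders Fields in place, assigns each field's Offset, and returns
    // the struct's total size and required alignment.
    return performOptimizedStructLayout(Fields);
  }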
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 48e3ad68a4f4..30acbc9110d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -46,8 +46,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
#include <vector>
#define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -210,35 +210,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
}
- // Sort by alignment, descending, to minimise padding.
- // On ties, sort by size, descending, then by name, lexicographical.
- llvm::stable_sort(
- FoundLocalVars,
- [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
- Align ALHS = AMDGPU::getAlign(DL, LHS);
- Align ARHS = AMDGPU::getAlign(DL, RHS);
- if (ALHS != ARHS) {
- return ALHS > ARHS;
- }
-
- TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
- TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
- if (SLHS != SRHS) {
- return SLHS > SRHS;
- }
-
- // By variable name on tie for predictable order in test cases.
- return LHS->getName() < RHS->getName();
- });
+ SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+ LayoutFields.reserve(FoundLocalVars.size());
+ for (GlobalVariable *GV : FoundLocalVars) {
+ OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
+ AMDGPU::getAlign(DL, GV));
+ LayoutFields.emplace_back(F);
+ }
+
+ performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
{
// This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
- for (size_t I = 0; I < FoundLocalVars.size(); I++) {
- GlobalVariable *FGV = FoundLocalVars[I];
- Align DataAlign = AMDGPU::getAlign(DL, FGV);
+ for (size_t I = 0; I < LayoutFields.size(); I++) {
+ GlobalVariable *FGV = static_cast<GlobalVariable *>(
+ const_cast<void *>(LayoutFields[I].Id));
+ Align DataAlign = LayoutFields[I].Alignment;
uint64_t DataAlignV = DataAlign.value();
if (uint64_t Rem = CurrentOffset % DataAlignV) {
@@ -257,7 +247,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
LocalVars.push_back(FGV);
- CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+ CurrentOffset += LayoutFields[I].Size;
}
}
@@ -272,14 +262,14 @@ class AMDGPULowerModuleLDS : public ModulePass {
: "llvm.amdgcn.module.lds");
StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
- Align MaxAlign =
- AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
+ Align StructAlign =
+ AMDGPU::getAlign(DL, LocalVars[0]);
GlobalVariable *SGV = new GlobalVariable(
M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false);
- SGV->setAlignment(MaxAlign);
+ SGV->setAlignment(StructAlign);
if (!F) {
appendToCompilerUsed(
M, {static_cast<GlobalValue *>(
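One detail worth noting in the hunk above: OptimizedStructLayoutField carries an opaque const void *Id, which is why the loop casts it back to GlobalVariable*. The helper also assigns each field's Offset and returns the fields sorted by ascending offset, so the padding could in principle be read back from those offsets rather than recomputed incrementally. A hypothetical sketch of that alternative (not what the patch does):

  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  using namespace llvm;

  // Fields arrive sorted by ascending Offset, so the gap before each
  // field is exactly the padding the struct needs at that point.
  void walkLayout(ArrayRef<OptimizedStructLayoutField> LayoutFields) {
    uint64_t CurrentOffset = 0;
    for (const OptimizedStructLayoutField &F : LayoutFields) {
      auto *GV = static_cast<GlobalVariable *>(const_cast<void *>(F.Id));
      uint64_t Padding = F.Offset - CurrentOffset; // may be zero
      // A real implementation would append a [Padding x i8] array and
      // then GV's value type to the struct body here.
      (void)GV;
      (void)Padding;
      CurrentOffset = F.Offset + F.Size;
    }
  }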
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
index 948c07dcc22c..f7070a6adcf9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -20,7 +20,7 @@
; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4
; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define i32 @get_func() local_unnamed_addr #0 {
entry:
%0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -28,7 +28,7 @@ entry:
}
; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define void @set_func(i32 %x) local_unnamed_addr #1 {
entry:
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -40,14 +40,14 @@ entry:
; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3
+; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
; CHECK: %5 = inttoptr i64 %4 to i32*
; CHECK: %ld = load i32, i32* %5, align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)
+; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
; CHECK: %10 = inttoptr i64 %9 to i32*
; CHECK: store i32 %mul, i32* %10, align 4
define amdgpu_kernel void @timestwo() {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
index 0ce2aba7ffde..104c87774a72 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -16,7 +16,7 @@
; But none of them are used anywhere. Hence, @lds.6 is not lowered.
;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8] }
; CHECK-NOT: @lds.1
; CHECK-NOT: @lds.2
@@ -41,7 +41,7 @@
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
-; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 4)
+; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2)
; CHECK: @alias.to.lds.2 = alias [2 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3)
; CHECK: @alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3
; CHECK: @alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
index 7985ecff4044..77fcefa7944d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -17,7 +17,7 @@
; and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [1 x float], [2 x float] }
; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
@@ -36,7 +36,7 @@
; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
-; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
index 167834409a0d..6c5669ed2a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -5,10 +5,10 @@
; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
; Different properly aligned values, but same size of 1.
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8], [3 x i8], [1 x i8] }
; All are under-aligned, requires to fix each on different alignment boundary.
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [1 x i8], [2 x i8], [3 x i8], [1 x i8], [5 x i8] }
; All LDS are underaligned, requires to allocate on 8 byte boundary
; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
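The k1 change above illustrates the packing win. Reading the old padding back, the five 1-byte variables appear to have alignments 16/8/4/2/1 (an inference, not taken from the test source): the old descending-alignment sort needed 7+3+1 bytes of padding for a 16-byte struct, while the optimized layout fills the gaps with the low-alignment variables and gets the struct down to 9 bytes. A standalone sketch that feeds the same fields to the helper and prints the offsets it assigns:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/OptimizedStructLayout.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // Five 1-byte fields; alignments inferred from k1's old padding.
    SmallVector<OptimizedStructLayoutField, 8> Fields;
    const char *Names[] = {"a16", "a8", "a4", "a2", "a1"};
    uint64_t Aligns[] = {16, 8, 4, 2, 1};
    for (int I = 0; I != 5; ++I)
      Fields.emplace_back(/*Id=*/Names[I], /*Size=*/1, Align(Aligns[I]));

    std::pair<uint64_t, Align> SA = performOptimizedStructLayout(Fields);
    for (const OptimizedStructLayoutField &F : Fields)
      outs() << static_cast<const char *>(F.Id) << " -> offset "
             << F.Offset << "\n";
    outs() << "size " << SA.first << ", align " << SA.second.value() << "\n";
  }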