[PATCH] D115488: [amdgpu] Increase alignment of all LDS variables

Thu Dec 9 17:55:04 PST 2021

JonChesterfield created this revision.
JonChesterfield added reviewers: hsmhsm, arsenm, b-sumner, t-tye, rampitec, ronlieb, foad.
Herald added subscribers: kerbowa, hiraditya, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
JonChesterfield requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

Currently the superalign option only increases the alignment of
variables that are moved into the module.lds block. Change that to all LDS
variables. Also only increase the alignment once, instead of once per function.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D115488

Files:
  llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
  llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll


Index: llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
-; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
 
@@ -8,6 +8,10 @@
 ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
 ; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i32 addrspace(3)*] }
 
+; SUPER-ALIGN_ON: @lds.unused = addrspace(3) global i32 undef, align 4
+; SUPER-ALIGN_OFF: @lds.unused = addrspace(3) global i32 undef, align 2
+ at lds.unused = addrspace(3) global i32 undef, align 2
+
 ; CHECK-NOT: @lds.1
 @lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
 
Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -151,7 +151,9 @@
   }
 
   bool runOnModule(Module &M) override {
-    bool Changed = processUsedLDS(M);
+    bool Changed = superAlignLDSGlobals(M);
+
+    Changed |= processUsedLDS(M);
 
     for (Function &F : M.functions()) {
       if (F.isDeclaration())
@@ -167,6 +169,50 @@
   }
 
 private:
+  // Increase the alignment of LDS globals if necessary to maximise the chance
+  // that we can use aligned LDS instructions to access them.
+  static bool superAlignLDSGlobals(Module &M) {
+    const DataLayout &DL = M.getDataLayout();
+    bool Changed = false;
+    if (!SuperAlignLDSGlobals) {
+      return Changed;
+    }
+
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        // Only changing alignment of LDS variables
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // cuda/hip extern __shared__ variable, leave alignment alone
+        continue;
+      }
+
+      Align Alignment = AMDGPU::getAlign(DL, &GV);
+      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
+
+      if (GVSize > 8) {
+        // We might want to use a b96 or b128 load/store
+        Alignment = std::max(Alignment, Align(16));
+      } else if (GVSize > 4) {
+        // We might want to use a b64 load/store
+        Alignment = std::max(Alignment, Align(8));
+      } else if (GVSize > 2) {
+        // We might want to use a b32 load/store
+        Alignment = std::max(Alignment, Align(4));
+      } else if (GVSize > 1) {
+        // We might want to use a b16 load/store
+        Alignment = std::max(Alignment, Align(2));
+      }
+
+      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
+        Changed = true;
+        GV.setAlignment(Alignment);
+      }
+    }
+    return Changed;
+  }
+
   bool processUsedLDS(Module &M, Function *F = nullptr) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D115488.393352.patch
Type: text/x-patch
Size: 3790 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20211210/912e2c0b/attachment.bin>