[llvm] cb3fbe9 - [AMDGPU] Set preferred function alignment based on icache geometry (#183064)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 11 04:57:41 PDT 2026
Author: michaelselehov
Date: 2026-03-11T07:57:37-04:00
New Revision: cb3fbe921b132a00ada5df2d2510e85d81b79cf8
URL: https://github.com/llvm/llvm-project/commit/cb3fbe921b132a00ada5df2d2510e85d81b79cf8
DIFF: https://github.com/llvm/llvm-project/commit/cb3fbe921b132a00ada5df2d2510e85d81b79cf8.diff
LOG: [AMDGPU] Set preferred function alignment based on icache geometry (#183064)
Non-entry functions were unconditionally aligned to 4 bytes with no
architecture-specific preferred alignment, and setAlignment() was used
instead of ensureAlignment(), overwriting any explicit IR attributes.
Add instruction cache line size and fetch alignment data to GCNSubtarget
for each generation (GFX9: 64B/32B, GFX10: 64B/4B, GFX11+: 128B/4B). Use
this to call setPrefFunctionAlignment() in SITargetLowering, aligning
non-entry functions to the cache line size by default. Change
setAlignment to ensureAlignment in AMDGPUAsmPrinter so explicit IR align
attributes are respected.
Empirical thread trace analysis on gfx942, gfx1030, gfx1100, and gfx1200
showed that only GFX9 exhibits measurable fetch stalls when functions
cross the 32-byte fetch window boundary. GFX10+ showed no alignment
sensitivity. A hidden option -amdgpu-align-functions-for-fetch-only is
provided to use the fetch granularity instead of cache line size.
Assisted-by: Claude Opus
Added:
llvm/test/CodeGen/AMDGPU/function-alignment.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
llvm/test/CodeGen/AMDGPU/hsa-func.ll
llvm/test/CodeGen/AMDGPU/s_code_end.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a0b6ff13e7d7a..cf638b1b5778f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -381,6 +381,16 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
+class SubtargetFeatureInstCacheLineSize <int Value> : SubtargetFeature <
+ "instcachelinesize"#Value,
+ "InstCacheLineSize",
+ !cast<string>(Value),
+ "Instruction cache line size in bytes."
+>;
+
+def FeatureInstCacheLineSize64 : SubtargetFeatureInstCacheLineSize<64>;
+def FeatureInstCacheLineSize128 : SubtargetFeatureInstCacheLineSize<128>;
+
defm GCN3Encoding : AMDGPUSubtargetFeature<"gcn3-encoding",
"Encoding format for VI",
/*GenPredicate=*/0
@@ -1333,6 +1343,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
+ FeatureInstCacheLineSize64,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
@@ -1352,7 +1363,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts,
- FeatureDX10ClampAndIEEEMode
+ FeatureDX10ClampAndIEEEMode, FeatureInstCacheLineSize64
]
>;
@@ -1371,7 +1382,8 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts,
FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
- FeatureCvtPkNormVOP2Insts, FeatureDX10ClampAndIEEEMode
+ FeatureCvtPkNormVOP2Insts, FeatureDX10ClampAndIEEEMode,
+ FeatureInstCacheLineSize64
]
>;
@@ -1393,7 +1405,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad,
FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
- FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode
+ FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode,
+ FeatureInstCacheLineSize64
]
>;
@@ -1420,7 +1433,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, FeatureCubeInsts,
FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
- FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode, FeatureFlatOffsetBits12
+ FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode, FeatureFlatOffsetBits12,
+ FeatureInstCacheLineSize64
]
>;
@@ -1445,7 +1459,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst,
FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts,
- FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts
+ FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts,
+ FeatureInstCacheLineSize128
]
>;
@@ -1470,7 +1485,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
- FeatureFlatOffsetBits24, FeatureFlatSignedOffset
+ FeatureFlatOffsetBits24, FeatureFlatSignedOffset, FeatureInstCacheLineSize128
]
>;
@@ -1495,7 +1510,7 @@ def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13",
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
- FeatureFlatOffsetBits24, FeatureFlatSignedOffset
+ FeatureFlatOffsetBits24, FeatureFlatSignedOffset, FeatureInstCacheLineSize128
]
>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 1f83df8099803..ebd26350263ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -686,7 +686,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
- MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
+ MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
SetupMachineFunction(MF);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 8c98e8b589b13..77bb36e9e60a8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -148,6 +148,15 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
+ // InstCacheLineSize is set from TableGen subtarget features
+ // (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
+ // Fall back to 64 if no feature was specified (e.g. generic targets).
+ if (InstCacheLineSize == 0)
+ InstCacheLineSize = 64;
+
+ assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
+ "InstCacheLineSize must be a power of 2");
+
TargetID.setTargetIDFromFeaturesString(FS);
LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 012e2dd6b380b..695dd7db61e53 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -66,6 +66,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
int LDSBankCount = 0;
unsigned MaxPrivateElementSize = 0;
+ // Instruction cache line size in bytes; set from TableGen subtarget features.
+ unsigned InstCacheLineSize = 0;
+
// Dynamically set bits that enable features.
bool DynamicVGPR = false;
bool DynamicVGPRBlockSize32 = false;
@@ -170,6 +173,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
int getLDSBankCount() const { return LDSBankCount; }
+ /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
+ unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
+
unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
: 16;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 932d6a5841aab..544aca0458975 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -196,6 +196,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
computeRegisterProperties(Subtarget->getRegisterInfo());
+ setMinFunctionAlignment(Align(4));
+ setPrefFunctionAlignment(Align(STI.getInstCacheLineSize()));
+
// The boolean content concept here is too inflexible. Compares only ever
// really produce a 1-bit result. Any copy/extend from these will turn into a
// select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
diff --git a/llvm/test/CodeGen/AMDGPU/function-alignment.ll b/llvm/test/CodeGen/AMDGPU/function-alignment.ll
new file mode 100644
index 0000000000000..078d662a38b58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-alignment.ll
@@ -0,0 +1,116 @@
+; Test preferred alignment of non-entry functions on different AMDGPU
+; architectures. Preferred alignment matches the instruction cache line size:
+;
+; GFX9 - cache line = 64B (.p2align 6)
+; GFX10 - cache line = 64B (.p2align 6)
+; GFX11 - cache line = 128B (.p2align 7)
+; GFX12 - cache line = 128B (.p2align 7)
+
+; --- Default (cache line alignment) ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+; --- Optsize: alignment drops to minimum (Align(4) = .p2align 2) ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=OPTSIZE %s
+
+; --- IR align attribute: ensureAlignment must not lower explicit alignment ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=EXPLICIT-ALIGN %s
+
+; --- -align-all-functions=1 with optsize: verify floor at Align(4) ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -align-all-functions=1 < %s | FileCheck -check-prefix=ALIGN-ALL %s
+
+; --- prefalign attribute: overrides target preferred alignment ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=PREFALIGN %s
+
+; --- Entry function: 256B alignment unchanged ---
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=ENTRY %s
+
+
+; Non-entry function: alignment matches instruction cache line size.
+define void @non_entry_func() {
+; GFX9: .p2align 6{{$}}
+; GFX9: non_entry_func:
+
+; GFX10: .p2align 6{{$}}
+; GFX10: non_entry_func:
+
+; GFX11: .p2align 7{{$}}
+; GFX11: non_entry_func:
+
+; GFX12: .p2align 7{{$}}
+; GFX12: non_entry_func:
+ ret void
+}
+
+; Non-entry function with optsize: must still be at least Align(4).
+define void @optsize_func() optsize {
+; OPTSIZE: .globl optsize_func
+; OPTSIZE-NEXT: .p2align 2{{$}}
+ ret void
+}
+
+; Non-entry function with explicit IR align 128: ensureAlignment must not lower
+; it. On GFX9 default is 64 (cache line), so 128 from IR must be preserved.
+define void @explicit_align_func() align 128 {
+; EXPLICIT-ALIGN: .globl explicit_align_func
+; EXPLICIT-ALIGN-NEXT: .p2align 7{{$}}
+ ret void
+}
+
+; Non-entry function with explicit IR align 32 on gfx900 -- lower than
+; preferred (64), so preferred alignment wins. Result: .p2align 6.
+define void @low_align_func() align 32 {
+; GFX9: .globl low_align_func
+; GFX9-NEXT: .p2align 6{{$}}
+ ret void
+}
+
+; Optsize + -align-all-functions=1: MachineFunction::init sets Align(2), but
+; ensureAlignment(4) in AsmPrinter restores the floor. With optsize,
+; getPreferredAlignment returns max(Align(1), Align(4)) = Align(4).
+define void @align_all_optsize_func() optsize {
+; ALIGN-ALL: .globl align_all_optsize_func
+; ALIGN-ALL-NEXT: .p2align 2{{$}}
+ ret void
+}
+
+; prefalign(16) on gfx900 overrides target preferred (64) with 16.
+; getPreferredAlignment uses prefalign directly instead of getPrefFunctionAlignment.
+; Result: max(16, 4) = 16 -> .p2align 4.
+define void @prefalign_low_func() prefalign(16) {
+; PREFALIGN: .globl prefalign_low_func
+; PREFALIGN-NEXT: .p2align 4{{$}}
+ ret void
+}
+
+; prefalign(256) on gfx900 -- higher than target preferred (64).
+; Result: max(256, 4) = 256 -> .p2align 8.
+define void @prefalign_high_func() prefalign(256) {
+; PREFALIGN: .globl prefalign_high_func
+; PREFALIGN-NEXT: .p2align 8{{$}}
+ ret void
+}
+
+; prefalign(2) on gfx900 -- below the 4-byte instruction alignment floor.
+; ensureAlignment(4) in AsmPrinter guarantees the minimum.
+; Result: max(2, 4) = 4 -> .p2align 2.
+define void @prefalign_floor_func() prefalign(2) {
+; PREFALIGN: .globl prefalign_floor_func
+; PREFALIGN-NEXT: .p2align 2{{$}}
+ ret void
+}
+
+; Entry function: must be 256B aligned regardless of our changes.
+define amdgpu_kernel void @entry_func() {
+; ENTRY: .globl entry_func
+; ENTRY-NEXT: .p2align 8{{$}}
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
index 0ac6ca2904074..e10c6934765e1 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
@@ -5,11 +5,11 @@
; ELF: Name: .text
; ELF: SHF_ALLOC (0x2)
; ELF: SHF_EXECINSTR (0x4)
-; ELF: AddressAlignment: 32
+; ELF: AddressAlignment: 64
; ELF: }
; HSA: .globl simple_align16
-; HSA: .p2align 5
+; HSA: .p2align 6
define void @simple_align16(ptr addrspace(4) %ptr.out) align 32 {
entry:
%out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-func.ll
index 4ef3c994d0622..b8bdacf450fb3 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-func.ll
@@ -14,7 +14,7 @@
; ELF: Flags [ (0x6)
; ELF: SHF_ALLOC (0x2)
; ELF: SHF_EXECINSTR (0x4)
-; ELF: AddressAlignment: 4
+; ELF: AddressAlignment: 64
; ELF: }
; ELF: SHT_NOTE
@@ -38,7 +38,7 @@
; HSA-NOT: .amdgpu_hsa_kernel simple
; HSA: .globl simple
-; HSA: .p2align 2
+; HSA: .p2align 6
; HSA: {{^}}simple:
; HSA-NOT: amd_kernel_code_t
; HSA: flat_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[0:1]
@@ -60,7 +60,7 @@ entry:
; Ignore explicit alignment that is too low.
; HSA: .globl simple_align2
-; HSA: .p2align 2
+; HSA: .p2align 6
define void @simple_align2(ptr addrspace(4) %ptr.out) align 2 {
entry:
%out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
index ad7d8a14babc1..2579807ac9862 100644
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @a_kernel2() #0 {
}
; GCN-ASM: .globl a_function
-; GCN-ASM-NEXT: .p2align 2
+; GCN-ASM-NEXT: .p2align {{[67]}}
; GCN-ASM-NEXT: .type a_function,@function
; GCN-NEXT: a_function{{>?}}:
@@ -55,11 +55,11 @@ define amdgpu_kernel void @a_kernel2() #0 {
; GFX11END-OBJ-NEXT: s_code_end
; GFX90AEND-OBJ-NEXT: s_nop 0
-; GFX10END-OBJ: s_code_end // 000000000140:
+; GFX10END-OBJ: s_code_end // {{[0-9A-F]+}}:
; GFX10END-OBJ-COUNT-47: s_code_end
-; GFX11END-OBJ: s_code_end // 000000000140:
+; GFX11END-OBJ: s_code_end // {{[0-9A-F]+}}:
; GFX11END-OBJ-COUNT-47: s_code_end
-; GFX90AEND-OBJ: s_nop 0 // 000000000140:
+; GFX90AEND-OBJ: s_nop 0 // {{[0-9A-F]+}}:
; GFX90AEND-OBJ-COUNT-255: s_nop 0
define void @a_function() #0 {
More information about the llvm-commits
mailing list