[llvm] [AMDGPU] Enable kernarg preloading by default on gfx940 (PR #110691)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 4 17:50:29 PDT 2024
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/110691
From 2cfa135fa692556c3d12600d33ee2c31118da682 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 1 Oct 2024 07:22:38 -0700
Subject: [PATCH 1/3] [AMDGPU] Enable kernarg preloading by default on gfx940
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 45 +++++-
.../AMDGPU/preload-kernargs-IR-lowering.ll | 128 +++++++++---------
.../AMDGPU/preload-kernargs-inreg-hints.ll | 2 +-
3 files changed, 106 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 687a7339da379d..e4ecbce8811069 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1014,12 +1014,49 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+static unsigned getMaxNumPreloadArgs(const Function &F, const DataLayout &DL,
+ const TargetMachine &TM) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ unsigned Offset = 0;
+ unsigned ArgsToPreload = 0;
+ for (const auto &Arg : F.args()) {
+ if (Arg.hasByRefAttr())
+ break;
+
+ Type *Ty = Arg.getType();
+ Align ArgAlign = DL.getABITypeAlign(Ty);
+ auto Size = DL.getTypeAllocSize(Ty);
+ Offset = alignTo(Offset, ArgAlign);
+ if (((Offset + Size) / 4) > ST.getMaxNumUserSGPRs())
+ break;
+
+ Offset += Size;
+ ArgsToPreload++;
+ }
+
+ return ArgsToPreload;
+}
+
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- for (unsigned I = 0;
- I < F.arg_size() &&
- I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
- ++I) {
+ if (!ST.hasKernargPreload())
+ return;
+
+ // Enable kernarg preloading by default on GFX940+.
+ size_t PreloadCount;
+ if (KernargPreloadCount.getNumOccurrences() > 0) {
+ // Override default behavior if CL option is present.
+ PreloadCount = std::min<size_t>(KernargPreloadCount, F.arg_size());
+ } else {
+ // Defaults with no CL option.
+ if (ST.hasGFX940Insts())
+ PreloadCount =
+ getMaxNumPreloadArgs(F, F.getParent()->getDataLayout(), TM);
+ else
+ PreloadCount = 0;
+ }
+
+ for (unsigned I = 0; I < PreloadCount; ++I) {
Argument &Arg = *F.getArg(I);
// Check for incompatible attributes.
if (Arg.hasByRefAttr() || Arg.hasNestAttr())
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
index ab0fb7584d50ce..3a5e72b0408bf7 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=0 -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=PRELOAD-DEFAULT %s
define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
@@ -32,12 +32,12 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
store i32 %load, ptr addrspace(1) %out
@@ -88,14 +88,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -184,20 +184,20 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
-; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
-; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
+; PRELOAD-DEFAULT-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -254,14 +254,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -312,14 +312,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -385,16 +385,16 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
+; PRELOAD-DEFAULT-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-DEFAULT-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
+; PRELOAD-DEFAULT-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -450,14 +450,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0,
; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
+; PRELOAD-DEFAULT-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-DEFAULT-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
+; PRELOAD-DEFAULT-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-DEFAULT-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
index 20edbd6c0d0fa6..bb08aa18cd8dea 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
@@ -95,7 +95,7 @@ define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x floa
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
+; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 inreg [[TMP16:%.*]], i32 inreg [[TMP17:%.*]]) #[[ATTR0]] {
; PRELOAD-20-NEXT: ret void
;
ret void
From 529a8db12f5f31ce586c39decea3de7ed47eedab Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Wed, 2 Oct 2024 13:30:38 -0700
Subject: [PATCH 2/3] Add predicate and match types.
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 6 +++---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +++
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index e4ecbce8811069..5ace7cc700bcef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1044,19 +1044,19 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
// Enable kernarg preloading by default on GFX940+.
size_t PreloadCount;
- if (KernargPreloadCount.getNumOccurrences() > 0) {
+ if (KernargPreloadCount.getNumOccurrences()) {
// Override default behavior if CL option is present.
PreloadCount = std::min<size_t>(KernargPreloadCount, F.arg_size());
} else {
// Defaults with no CL option.
- if (ST.hasGFX940Insts())
+ if (ST.defaultEnabledKernargPreload())
PreloadCount =
getMaxNumPreloadArgs(F, F.getParent()->getDataLayout(), TM);
else
PreloadCount = 0;
}
- for (unsigned I = 0; I < PreloadCount; ++I) {
+ for (size_t I = 0; I < PreloadCount; ++I) {
Argument &Arg = *F.getArg(I);
// Check for incompatible attributes.
if (Arg.hasByRefAttr() || Arg.hasNestAttr())
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e6b7342d5fffcf..959644fe29c79b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1348,6 +1348,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if preloading kernel arguments is supported.
bool hasKernargPreload() const { return KernargPreload; }
+ // \returns true if target enables preloading kernel arguments by default.
+ bool defaultEnabledKernargPreload() const { return hasGFX940Insts(); }
+
// \returns true if the target has split barriers feature
bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
From fe82ef33b6e648e7a39ce73117b7fb0754290b22 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Fri, 4 Oct 2024 17:41:50 -0700
Subject: [PATCH 3/3] Fix types.
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 5ace7cc700bcef..c53ea6757fbb38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1015,8 +1015,7 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
static unsigned getMaxNumPreloadArgs(const Function &F, const DataLayout &DL,
- const TargetMachine &TM) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST) {
unsigned Offset = 0;
unsigned ArgsToPreload = 0;
for (const auto &Arg : F.args()) {
@@ -1051,7 +1050,7 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
// Defaults with no CL option.
if (ST.defaultEnabledKernargPreload())
PreloadCount =
- getMaxNumPreloadArgs(F, F.getParent()->getDataLayout(), TM);
+ getMaxNumPreloadArgs(F, F.getParent()->getDataLayout(), ST);
else
PreloadCount = 0;
}
More information about the llvm-commits
mailing list