[llvm] [AMDGPU] Enable kernarg preloading by default on gfx940 (PR #110691)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 1 08:43:03 PDT 2024
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/110691
From 564d80be93c583c0017b391c8fea29614bda1c46 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 1 Oct 2024 07:22:38 -0700
Subject: [PATCH] [AMDGPU] Enable kernarg preloading by default on gfx940
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 45 +++++-
.../AMDGPU/preload-kernargs-IR-lowering.ll | 128 +++++++++---------
.../AMDGPU/preload-kernargs-inreg-hints.ll | 2 +-
3 files changed, 106 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 687a7339da379d..e4ecbce8811069 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1014,12 +1014,49 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+static unsigned getMaxNumPreloadArgs(const Function &F, const DataLayout &DL,
+ const TargetMachine &TM) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ unsigned Offset = 0;
+ unsigned ArgsToPreload = 0;
+ for (const auto &Arg : F.args()) {
+ if (Arg.hasByRefAttr())
+ break;
+
+ Type *Ty = Arg.getType();
+ Align ArgAlign = DL.getABITypeAlign(Ty);
+ auto Size = DL.getTypeAllocSize(Ty);
+ Offset = alignTo(Offset, ArgAlign);
+ if (((Offset + Size) / 4) > ST.getMaxNumUserSGPRs())
+ break;
+
+ Offset += Size;
+ ArgsToPreload++;
+ }
+
+ return ArgsToPreload;
+}
+
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- for (unsigned I = 0;
- I < F.arg_size() &&
- I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
- ++I) {
+ if (!ST.hasKernargPreload())
+ return;
+
+ // Enable kernarg preloading by default on GFX940+.
+ size_t PreloadCount;
+ if (KernargPreloadCount.getNumOccurrences() > 0) {
+ // Override default behavior if CL option is present.
+ PreloadCount = std::min<size_t>(KernargPreloadCount, F.arg_size());
+ } else {
+ // Defaults with no CL option.
+ if (ST.hasGFX940Insts())
+ PreloadCount =
+ getMaxNumPreloadArgs(F, F.getParent()->getDataLayout(), TM);
+ else
+ PreloadCount = 0;
+ }
+
+ for (unsigned I = 0; I < PreloadCount; ++I) {
Argument &Arg = *F.getArg(I);
// Check for incompatible attributes.
if (Arg.hasByRefAttr() || Arg.hasNestAttr())
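
Since the heuristic above is easy to misread, here is a minimal standalone sketch (plain C++, not LLVM code) of the dword accounting that getMaxNumPreloadArgs performs: walk the arguments in order, align each offset to the argument's ABI alignment, and stop preloading once an argument would extend past the user-SGPR budget. ArgInfo, alignUp, maxPreloadArgs, and the sizes, alignments, and 16-SGPR budget in main are illustrative assumptions; the real pass gets these values from DataLayout and GCNSubtarget.

// Standalone sketch of the preload-count accounting; not LLVM code.
#include <cstdint>
#include <iostream>
#include <vector>

struct ArgInfo {
  uint64_t Size;  // bytes, stands in for DL.getTypeAllocSize(Ty)
  uint64_t Align; // bytes, stands in for DL.getABITypeAlign(Ty)
};

// Round Offset up to the next multiple of Align.
static uint64_t alignUp(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) / Align * Align;
}

// Count how many leading arguments fit within MaxUserSGPRs 32-bit registers.
static unsigned maxPreloadArgs(const std::vector<ArgInfo> &Args,
                               unsigned MaxUserSGPRs) {
  uint64_t Offset = 0;
  unsigned Count = 0;
  for (const ArgInfo &A : Args) {
    Offset = alignUp(Offset, A.Align);
    if ((Offset + A.Size) / 4 > MaxUserSGPRs)
      break;
    Offset += A.Size;
    ++Count;
  }
  return Count;
}

int main() {
  // Two 8-byte pointers and an i16, with an assumed budget of 16 user SGPRs:
  // all three arguments fit and would be preloaded.
  std::vector<ArgInfo> Args = {{8, 8}, {8, 8}, {2, 2}};
  std::cout << maxPreloadArgs(Args, 16) << "\n"; // prints 3
}
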
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
index ab0fb7584d50ce..3a5e72b0408bf7 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=0 -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=PRELOAD-DEFAULT %s
define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
@@ -32,12 +32,12 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
store i32 %load, ptr addrspace(1) %out
@@ -88,14 +88,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -184,20 +184,20 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i
; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
-; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
-; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
+; PRELOAD-DEFAULT-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -254,14 +254,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -312,14 +312,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
+; PRELOAD-DEFAULT-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -385,16 +385,16 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg
; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
+; PRELOAD-DEFAULT-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-DEFAULT-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-DEFAULT-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-DEFAULT-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
+; PRELOAD-DEFAULT-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -450,14 +450,14 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0,
; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT: ret void
;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-DEFAULT-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
+; PRELOAD-DEFAULT-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-DEFAULT-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-DEFAULT-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-DEFAULT-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
+; PRELOAD-DEFAULT-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-DEFAULT-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-DEFAULT-NEXT: ret void
;
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
index 20edbd6c0d0fa6..bb08aa18cd8dea 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
@@ -95,7 +95,7 @@ define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x floa
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
+; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 inreg [[TMP16:%.*]], i32 inreg [[TMP17:%.*]]) #[[ATTR0]] {
; PRELOAD-20-NEXT: ret void
;
ret void
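
Usage note: with this default in place, gfx940 kernels get their leading arguments preloaded without any extra flag, while -amdgpu-kernarg-preload-count still takes precedence when passed explicitly; -amdgpu-kernarg-preload-count=0, as in the updated NO-PRELOAD RUN line, keeps the old no-preload behavior for comparison.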