[llvm] [AMDGPU] Move kernarg preload logic to AMDGPU Attributor (PR #123547)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 31 09:30:07 PST 2025
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/123547
>From 6d27ee06b8901cc255ccba846af29be59d6d3e33 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 19 Jan 2025 15:44:12 -0800
Subject: [PATCH 1/3] [AMDGPU] Pre-commit tests for preload kernarg move to
attributor
NFC. Since the handling will be moved to the middle end, the tests need to
be reorganized. The IR tests are expanded since end-to-end testing is no
longer baked in.
---
.../preload-implicit-kernargs-IR-lowering.ll | 712 ++++++-
.../AMDGPU/preload-implicit-kernargs.ll | 593 ++----
.../AMDGPU/preload-kernargs-IR-lowering.ll | 1642 +++++++++++++----
.../AMDGPU/preload-kernargs-inreg-hints.ll | 263 ---
4 files changed, 2073 insertions(+), 1137 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index aeb7faade47150..180f01257f1f6d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -1,20 +1,72 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
+
+define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -27,9 +79,34 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] {
+define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
@@ -38,8 +115,8 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -52,31 +129,335 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
ret void
}
-define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
+define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i64, ptr addrspace(4) %imp_arg_ptr
+ store i64 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i16, ptr addrspace(4) %imp_arg_ptr
+ store i16 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Y]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Z]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[VAL_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[VAL_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[VAL:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[VAL]] to i32
+; PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[_HIDDEN_BLOCK_COUNT_X]], [[EXT]]
+; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ %ext = zext i8 %val to i32
+ %add = add i32 %load, %ext
+ store i32 %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0
+; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4
+; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4
+; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4
+; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD_X]], i32 0
+; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[LOAD_Y]], i32 1
+; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[LOAD_Z]], i32 2
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0
+; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4
+; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4
+; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4
+; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_X]], i32 0
+; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[_HIDDEN_BLOCK_COUNT_Y]], i32 1
+; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 2
+; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+ %load_x = load i32, ptr addrspace(4) %gep_x
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load_y = load i32, ptr addrspace(4) %gep_y
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %load_z = load i32, ptr addrspace(4) %gep_z
+ %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
-; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32
; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%load = load i16, ptr addrspace(4) %gep
%conv = zext i16 %load to i32
store i32 %conv, ptr addrspace(1) %out
@@ -84,8 +465,8 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
@@ -105,8 +486,8 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
@@ -141,74 +522,206 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
-; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
-; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i64, ptr addrspace(4) %imp_arg_ptr
- store i64 %load, ptr addrspace(1) %out
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset(
-; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Y_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
-; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
-; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
- %load = load i32, ptr addrspace(4) %gep
- store i32 %load, ptr addrspace(1) %out
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
+; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
+; NO-PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32
+; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
+; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
+; NO-PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32
+; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2
+; NO-PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32
+; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
+; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
+; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
+; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
+; PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32
+; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
+; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
+; PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32
+; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2
+; PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
+; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
+; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
+; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
+; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+ %load_x = load i16, ptr addrspace(4) %gep_x
+ %conv_x = zext i16 %load_x to i32
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+ %load_y = load i16, ptr addrspace(4) %gep_y
+ %conv_y = zext i16 %load_y to i32
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load_z = load i16, ptr addrspace(4) %gep_z
+ %conv_z = zext i16 %load_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) %out, i192 %t0, i32 %t1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i192 [[T0:%.*]], i32 [[T1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i192 inreg [[T0:%.*]], i32 inreg [[T1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -217,6 +730,57 @@ define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1)
ret void
}
-;.
-; NO-PRELOAD: [[META0]] = !{}
-;.
+define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; NO-PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
+; NO-PRELOAD-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2
+; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2
+; NO-PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[LOAD1]] to i32
+; NO-PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[LOAD2]] to i32
+; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD0]], i32 0
+; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1
+; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
+; PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
+; PRELOAD-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4
+; PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2
+; PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2
+; PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32
+; PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
+; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 0
+; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1
+; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2
+; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+ %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load0 = load i32, ptr addrspace(4) %gep0
+ %load1 = load i16, ptr addrspace(4) %gep1
+ %load2 = load i16, ptr addrspace(4) %gep2
+ %conv1 = zext i16 %load1 to i32
+ %conv2 = zext i16 %load2 to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 31beb7a3cce248..7c667027bf5429 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,8 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
-define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
; GFX940-LABEL: preload_block_count_x:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -16,27 +15,13 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB0_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
+ store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 {
+define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
; GFX940-LABEL: preload_unused_arg_block_count_x:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -51,60 +36,30 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
; GFX940-NEXT: v_mov_b32_e32 v1, s6
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_unused_arg_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB1_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
+ store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) {
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
; GFX940-LABEL: no_free_sgprs_block_count_x:
; GFX940: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB2_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB2_0:
-; GFX940-NEXT: s_load_dword s0, s[4:5], 0x28
; GFX940-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v1, s12
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: no_free_sgprs_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB2_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
+ store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4
ret void
}
@@ -118,26 +73,13 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: no_inreg_block_count_x:
-; GFX90a: ; %bb.0:
-; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
+ store i32 %load, ptr addrspace(1) %out, align 4
ret void
}
-; Implicit arg preloading is currently restricted to cases where all explicit
-; args are inreg (preloaded).
-
-define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 {
+define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg %0) #0 {
; GFX940-LABEL: mixed_inreg_block_count_x:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10
@@ -147,19 +89,9 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: mixed_inreg_block_count_x:
-; GFX90a: ; %bb.0:
-; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
+ store i32 %load, ptr addrspace(1) %out, align 4
ret void
}
@@ -178,24 +110,9 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: incorrect_type_i64_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB5_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i64, ptr addrspace(4) %imp_arg_ptr
- store i64 %load, ptr addrspace(1) %out
+ %load = load i64, ptr addrspace(4) %imp_arg_ptr, align 8
+ store i64 %load, ptr addrspace(1) %out, align 8
ret void
}
@@ -214,28 +131,13 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: incorrect_type_i16_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB6_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB6_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i16, ptr addrspace(4) %imp_arg_ptr
- store i16 %load, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %imp_arg_ptr, align 2
+ store i16 %load, ptr addrspace(1) %out, align 2
ret void
}
-define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 {
; GFX940-LABEL: preload_block_count_y:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -249,24 +151,10 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB7_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB7_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
- %load = load i32, ptr addrspace(4) %gep
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %gep, align 4
+ store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4
ret void
}
@@ -286,30 +174,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: random_incorrect_offset:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB8_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB8_0:
-; GFX90a-NEXT: s_mov_b32 s0, 8
-; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
- %load = load i32, ptr addrspace(4) %gep
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %gep, align 4
+ store i32 %load, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z) #0 {
; GFX940-LABEL: preload_block_count_z:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -324,29 +196,14 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
; GFX940-NEXT: v_mov_b32_e32 v1, s6
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB9_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
- %load = load i32, ptr addrspace(4) %gep
- store i32 %load, ptr addrspace(1) %out
+ %load = load i32, ptr addrspace(4) %gep, align 4
+ store i32 %_hidden_block_count_z, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 {
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -363,32 +220,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB10_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s10, s0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4
%ext = zext i8 %val to i32
- %add = add i32 %load, %ext
- store i32 %add, ptr addrspace(1) %out
+ %add = add i32 %_hidden_block_count_x, %ext
+ store i32 %add, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z) #0 {
; GFX940-LABEL: preload_block_count_xyz:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -405,38 +245,21 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
; GFX940-NEXT: v_mov_b32_e32 v2, s6
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB11_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
- %load_x = load i32, ptr addrspace(4) %gep_x
+ %load_x = load i32, ptr addrspace(4) %gep_x, align 4
%gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
- %load_y = load i32, ptr addrspace(4) %gep_y
+ %load_y = load i32, ptr addrspace(4) %gep_y, align 4
%gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
- %load_z = load i32, ptr addrspace(4) %gep_z
- %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0
- %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
- %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
- store <3 x i32> %ins.2, ptr addrspace(1) %out
+ %load_z = load i32, ptr addrspace(4) %gep_z, align 4
+ %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %_hidden_block_count_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %_hidden_block_count_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x) #0 {
; GFX940-LABEL: preload_workgroup_size_x:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -451,30 +274,15 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_workgroup_size_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB12_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_group_size_x to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y) #0 {
; GFX940-LABEL: preload_workgroup_size_y:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -489,30 +297,15 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_workgroup_size_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB13_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_group_size_y to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 {
; GFX940-LABEL: preload_workgroup_size_z:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -528,31 +321,15 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_workgroup_size_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB14_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_group_size_z to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 {
; GFX940-LABEL: preload_workgroup_size_xyz:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -572,44 +349,24 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_workgroup_size_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB15_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
- %load_x = load i16, ptr addrspace(4) %gep_x
- %conv_x = zext i16 %load_x to i32
+ %load_x = load i16, ptr addrspace(4) %gep_x, align 2
+ %conv_x = zext i16 %_hidden_group_size_x to i32
%gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
- %load_y = load i16, ptr addrspace(4) %gep_y
- %conv_y = zext i16 %load_y to i32
+ %load_y = load i16, ptr addrspace(4) %gep_y, align 2
+ %conv_y = zext i16 %_hidden_group_size_y to i32
%gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
- %load_z = load i16, ptr addrspace(4) %gep_z
- %conv_z = zext i16 %load_z to i32
- %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
- %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
- %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
- store <3 x i32> %ins.2, ptr addrspace(1) %out
+ %load_z = load i16, ptr addrspace(4) %gep_z, align 2
+ %conv_z = zext i16 %_hidden_group_size_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x) #0 {
; GFX940-LABEL: preload_remainder_x:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -625,31 +382,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_remainder_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB16_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_remainder_x to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y) #0 {
; GFX940-LABEL: preloadremainder_y:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -665,31 +406,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preloadremainder_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB17_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_remainder_y to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
; GFX940-LABEL: preloadremainder_z:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -705,31 +430,15 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preloadremainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB18_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_remainder_z to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
; GFX940-LABEL: preloadremainder_xyz:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -749,47 +458,29 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preloadremainder_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB19_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
-; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
- %load_x = load i16, ptr addrspace(4) %gep_x
- %conv_x = zext i16 %load_x to i32
+ %load_x = load i16, ptr addrspace(4) %gep_x, align 2
+ %conv_x = zext i16 %_hidden_remainder_x to i32
%gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
- %load_y = load i16, ptr addrspace(4) %gep_y
- %conv_y = zext i16 %load_y to i32
+ %load_y = load i16, ptr addrspace(4) %gep_y, align 2
+ %conv_y = zext i16 %_hidden_remainder_y to i32
%gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
- %load_z = load i16, ptr addrspace(4) %gep_z
- %conv_z = zext i16 %load_z to i32
- %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
- %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
- %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
- store <3 x i32> %ins.2, ptr addrspace(1) %out
+ %load_z = load i16, ptr addrspace(4) %gep_z, align 2
+ %conv_z = zext i16 %_hidden_remainder_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) {
+define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out, i128 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
; GFX940-LABEL: no_free_sgprs_preloadremainder_z:
; GFX940: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB20_0
; GFX940-NEXT: .p2align 8
@@ -798,74 +489,41 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX940-NEXT: s_lshr_b32 s0, s15, 16
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB20_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB20_0:
-; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
- %load = load i16, ptr addrspace(4) %gep
- %conv = zext i16 %load to i32
- store i32 %conv, ptr addrspace(1) %out
+ %load = load i16, ptr addrspace(4) %gep, align 2
+ %conv = zext i16 %_hidden_remainder_z to i32
+ store i32 %conv, ptr addrspace(1) %out, align 4
ret void
}
-; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
+; This should use s15 for the hidden argument.
-define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
-; GFX940-LABEL: preload_block_max_user_sgprs:
+define amdgpu_kernel void @preload_block_y_max_user_sgprs(ptr addrspace(1) inreg %out, i256 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 {
+; GFX940-LABEL: preload_block_y_max_user_sgprs:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28
+; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB21_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB21_0:
; GFX940-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NEXT: v_mov_b32_e32 v1, s12
+; GFX940-NEXT: v_mov_b32_e32 v1, s15
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_max_user_sgprs:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB21_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB21_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load = load i32, ptr addrspace(4) %gep, align 4
+ store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
+define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -884,39 +542,20 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
-;
-; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB22_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
- %load0 = load i32, ptr addrspace(4) %gep0
- %load1 = load i16, ptr addrspace(4) %gep1
- %load2 = load i16, ptr addrspace(4) %gep2
- %conv1 = zext i16 %load1 to i32
- %conv2 = zext i16 %load2 to i32
- %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
- %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
- %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
- store <3 x i32> %ins.2, ptr addrspace(1) %out
+ %load0 = load i32, ptr addrspace(4) %gep0, align 4
+ %load1 = load i16, ptr addrspace(4) %gep1, align 2
+ %load2 = load i16, ptr addrspace(4) %gep2, align 2
+ %conv1 = zext i16 %_hidden_group_size_z to i32
+ %conv2 = zext i16 %_hidden_remainder_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_z, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16
ret void
}
-attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx940" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
index ab0fb7584d50ce..658ef33f74935d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
@@ -1,60 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=2 -S < %s | FileCheck -check-prefix=PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD-ALL %s
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
+define amdgpu_kernel void @ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
-; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
-; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-2-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-ALL-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
store i32 %load, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
+define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
+; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
@@ -62,40 +52,27 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i
; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
-; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
-; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
-; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
-; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
+; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -104,25 +81,25 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
+define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
+; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
+; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40
; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48
; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
+; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
@@ -134,70 +111,45 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i
; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
-; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
-; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
-; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
-; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
-; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
-; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
-; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
-; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
-; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
-; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
-; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
-; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
-; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
-; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
-; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
-; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
+; PRELOAD-2-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40
+; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
+; PRELOAD-2-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-2-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
+; PRELOAD-2-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
+; PRELOAD-ALL-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -210,19 +162,17 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i
ret void
}
-; Preload args with inreg in the NO-PRELOAD case.
-
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0
+define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
+; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
@@ -230,38 +180,27 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad
; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
-; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24
+; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -270,56 +209,45 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad
ret void
}
-; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument.
-
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
+define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
+; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
-; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
-; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24
+; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -328,20 +256,20 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se
ret void
}
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
+define amdgpu_kernel void @i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
+; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32
; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
@@ -351,50 +279,33 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg
; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
-; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
-; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
-; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
-; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
-; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
-; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
+; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
+; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
+; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
%load1 = load i32, ptr addrspace(1) %in1
@@ -405,20 +316,18 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg
ret void
}
-; In this case both i16 args with be preloaded into the first SGPR.
-
-define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
+define amdgpu_kernel void @i16_i16_ptr1_kernel(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
@@ -426,38 +335,25 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0,
; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
-; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-1-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
-; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; PRELOAD-1-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
-; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
-; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32
-; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
-; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
-; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
-; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
-; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-8-NEXT: ret void
+; PRELOAD-2-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
+; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
+; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
+; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
;
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -466,4 +362,1104 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0,
ret void
}
-attributes #0 = { nounwind }
+define amdgpu_kernel void @ptr1_i8_kernel(ptr addrspace(1) %out, i8 %arg0) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %ext = zext i8 %arg0 to i32
+ store i32 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i8_zeroext_kernel(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 zeroext [[ARG0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %ext = zext i8 %arg0 to i32
+ store i32 %ext, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_kernel(ptr addrspace(1) %out, i16 %arg0) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %ext = zext i16 %arg0 to i32
+ store i32 %ext, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i32_kernel(ptr addrspace(1) %out, i32 %arg0) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i32_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[ARG0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i32_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i32_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i32 %arg0, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @i32_ptr1_i32_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
+; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
+; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]]
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
+; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
+; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %add = add i32 %arg0, %arg1
+ store i32 %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_i16_kernel(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
+; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
+; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
+; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
+; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %ext = zext i16 %arg0 to i32
+ %ext1 = zext i16 %arg1 to i32
+ %add = add i32 %ext, %ext1
+ store i32 %add, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v2i8_kernel(ptr addrspace(1) %out, <2 x i8> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x i8> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to <2 x i8>
+; NO-PRELOAD-NEXT: store <2 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <2 x i8> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_byref_i32_i32_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
+; NO-PRELOAD-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
+; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
+; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
+; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
+; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
+; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %in = load i32, ptr addrspace(4) %in.byref
+ store volatile i32 %in, ptr addrspace(1) %out, align 4
+ store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
+; NO-PRELOAD-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
+; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
+; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
+; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
+; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
+; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %in = load i32, ptr addrspace(4) %in.byref
+ store volatile i32 %in, ptr addrspace(1) %out, align 4
+ store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8 x i32> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-ALL-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <8 x i32> %in, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3 x i16> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO-PRELOAD-NEXT: store <3 x i16> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <3 x i16> %in, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3 x i32> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO-PRELOAD-NEXT: store <3 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <3 x i32> %in, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3 x float> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO-PRELOAD-NEXT: store <3 x float> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <3 x float> %in, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5 x i8> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <5 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <5 x i8> %in, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5 x double> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
+; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
+; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-ALL-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <5 x double> %in, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v8i8_kernel(ptr addrspace(1) %out, <8 x i8> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i8> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <8 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <8 x i8> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i64_kernel(ptr addrspace(1) %out, i64 %a) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i64_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[A_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[A_LOAD:%.*]] = load i64, ptr addrspace(4) [[A_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i64 [[A_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i64_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i64_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i64 %a, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_f64_kernel(ptr addrspace(1) %out, double %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_f64_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], double [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load double, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store double [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_f64_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_f64_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store double %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_half_kernel(ptr addrspace(1) %out, half %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
+; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: ret void
+;
+ store half %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_bfloat_kernel(ptr addrspace(1) %out, bfloat %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], bfloat [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to bfloat
+; NO-PRELOAD-NEXT: store bfloat [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: ret void
+;
+ store bfloat %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v2bfloat_kernel(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <2 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <2 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <2 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v3bfloat_kernel(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <3 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO-PRELOAD-NEXT: store <3 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <3 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v6bfloat_kernel(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <6 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <6 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <6 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <6 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_half_v7bfloat_kernel(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32
+; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store <7 x bfloat> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: store <7 x bfloat> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
+; PRELOAD-ALL-NEXT: ret void
+;
+ store half %in, ptr addrspace(1) %out
+ store <7 x bfloat> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i1_kernel(ptr addrspace(1) %out, i1 %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i1_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i1 [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; NO-PRELOAD-NEXT: store i1 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 1
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i1 %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_fp128_kernel(ptr addrspace(1) %out, fp128 %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], fp128 [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load fp128, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store fp128 [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-ALL-NEXT: ret void
+;
+ store fp128 %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v7i8_kernel(ptr addrspace(1) %out, <7 x i8> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x i8> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <7 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <7 x i8> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_v7half_kernel(ptr addrspace(1) %out, <7 x half> %in) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x half> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x half>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store <7 x half> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16
+; PRELOAD-ALL-NEXT: ret void
+;
+ store <7 x half> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12
+; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12
+; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i16 %in, ptr addrspace(1) %out
+ store i32 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_v3i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
+; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i16 %in, ptr addrspace(1) %out
+ store <3 x i32> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_i16_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
+; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i16 %in, ptr addrspace(1) %out
+ store i16 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_v2i8_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
+; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP5]] to <2 x i8>
+; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
+; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP3]] to <2 x i8>
+; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-2-NEXT: store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-ALL-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2
+; PRELOAD-ALL-NEXT: ret void
+;
+ store i16 %in, ptr addrspace(1) %out
+ store <2 x i8> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @i32_ptr1_i32_staggered_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
+; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16
+; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
+; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16
+; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]]
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
+; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
+; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %add = add i32 %arg0, %arg1
+ store i32 %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ptr1_i8_i32_trailing_unused_kernel(ptr addrspace(1) %out, i8 %arg0, i32 %unused) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 8
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: ret void
+;
+; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 inreg [[UNUSED:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: ret void
+;
+ %ext = zext i8 %arg0 to i32
+ store i32 %ext, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
deleted file mode 100644
index 20edbd6c0d0fa6..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ /dev/null
@@ -1,263 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s
-
-define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-20-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]])
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-20-NEXT: ret void
-;
- call void @func(ptr %0)
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-20-NEXT: ret void
-;
- call void @llvm.amdgcn.set.prio(i16 %0)
- ret void
-}
-
-define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
- ret void
-}
-
-declare void @func(ptr) #0
-declare void @llvm.amdgcn.set.prio(i16)
-
-attributes #0 = { nounwind }
>From 1af8cfb2224fce260176f0e71035ccd61168d69b Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Fri, 13 Dec 2024 16:24:14 -0800
Subject: [PATCH 2/3] [AMDGPU] Move kernarg preload logic to AMDGPU Attributor
Besides the changes listed below everything works the same as it did
when this code was in AMDGPULowerKernelArguments.
There is a refactoring of the free user SGPR tracking to make it
simpler and more accurate. We don't actually care which SGPRs hold
which arguments before ISel so specific tracking of the number of free
registers is removed. In one case this leads to one extra argument being
preloaded in a test. ISel correctly identifies this opportunity even
when the IR pass previously missed it. Even though inreg is meant to act
as a hint the coupling between the attribute and whether an argument is
actually preloaded should be equivalent now, although ISel always makes
the final determination.
Since we are no longer handling this in AMDGPULowerKernelArguments that
pass must rely on the inreg attribute to determine whether to leave
arguments as is. This leads to some test changes.
This lowering is moved out of the llc pipeline which requires test
updates.
Cloned function declarations are removed when kernel signatures are
modified to preload hidden arguments.
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 283 ++++++-
.../AMDGPU/AMDGPULowerKernelArguments.cpp | 255 +------
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 8 +-
...gcn.raw.ptr.buffer.atomic.fadd-with-ret.ll | 4 +-
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 8 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 4 +-
.../preload-implicit-kernargs-IR-lowering.ll | 158 +---
.../AMDGPU/preload-kernargs-IR-lowering.ll | 721 +++---------------
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 28 +-
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 8 +-
10 files changed, 456 insertions(+), 1021 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 546db318c17d53..d7c700f40c824b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,9 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
}
namespace {
+
+class PreloadKernelArgInfo {
+private:
+ Function &F;
+ const GCNSubtarget &ST;
+ unsigned NumFreeUserSGPRs;
+
+ enum HiddenArg : unsigned {
+ HIDDEN_BLOCK_COUNT_X,
+ HIDDEN_BLOCK_COUNT_Y,
+ HIDDEN_BLOCK_COUNT_Z,
+ HIDDEN_GROUP_SIZE_X,
+ HIDDEN_GROUP_SIZE_Y,
+ HIDDEN_GROUP_SIZE_Z,
+ HIDDEN_REMAINDER_X,
+ HIDDEN_REMAINDER_Y,
+ HIDDEN_REMAINDER_Z,
+ END_HIDDEN_ARGS
+ };
+
+ // Stores information about a specific hidden argument.
+ struct HiddenArgInfo {
+    // Offset in bytes from the location in the kernarg segment pointed to by
+ // the implicitarg pointer.
+ uint8_t Offset;
+ // The size of the hidden argument in bytes.
+ uint8_t Size;
+ // The name of the hidden argument in the kernel signature.
+ const char *Name;
+ };
+
+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+ {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+ {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+ {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+ {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
+ {22, 2, "_hidden_remainder_z"}};
+
+ static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+ for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+ if (HiddenArgs[I].Offset == Offset)
+ return static_cast<HiddenArg>(I);
+
+ return END_HIDDEN_ARGS;
+ }
+
+ static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+ if (HA < END_HIDDEN_ARGS)
+ return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+ llvm_unreachable("Unexpected hidden argument.");
+ }
+
+ static const char *getHiddenArgName(HiddenArg HA) {
+ if (HA < END_HIDDEN_ARGS) {
+ return HiddenArgs[HA].Name;
+ }
+ llvm_unreachable("Unexpected hidden argument.");
+ }
+
+ // Clones the function after adding implicit arguments to the argument list
+ // and returns the new updated function. Preloaded implicit arguments are
+ // added up to and including the last one that will be preloaded, indicated by
+ // LastPreloadIndex. Currently preloading is only performed on the totality of
+ // sequential data from the kernarg segment including implicit (hidden)
+ // arguments. This means that all arguments up to the last preloaded argument
+ // will also be preloaded even if that data is unused.
+ Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+ FunctionType *FT = F.getFunctionType();
+ LLVMContext &Ctx = F.getParent()->getContext();
+ SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+ for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+ FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+ FunctionType *NFT =
+ FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+ Function *NF =
+ Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+ NF->copyAttributesFrom(&F);
+ NF->copyMetadata(&F, 0);
+ NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+ F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+ NF->takeName(&F);
+ NF->splice(NF->begin(), &F);
+
+ Function::arg_iterator NFArg = NF->arg_begin();
+ for (Argument &Arg : F.args()) {
+ Arg.replaceAllUsesWith(&*NFArg);
+ NFArg->takeName(&Arg);
+ ++NFArg;
+ }
+
+ AttrBuilder AB(Ctx);
+ AB.addAttribute(Attribute::InReg);
+ AB.addAttribute("amdgpu-hidden-argument");
+ AttributeList AL = NF->getAttributes();
+ for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+ AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+ NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+ }
+
+ NF->setAttributes(AL);
+ F.replaceAllUsesWith(NF);
+
+ return NF;
+ }
+
+public:
+ PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+ setInitialFreeUserSGPRsCount();
+ }
+
+ // Returns the maximum number of user SGPRs that we have available to preload
+ // arguments.
+ void setInitialFreeUserSGPRsCount() {
+ GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+ NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+ }
+
+ bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
+ return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+ }
+
+ // Try to allocate SGPRs to preload hidden kernel arguments.
+ void
+ tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+ SmallVectorImpl<Function *> &FunctionsToErase) {
+ Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+ if (!ImplicitArgPtr)
+ return;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ // Pair is the load and the load offset.
+ SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+ for (auto *U : ImplicitArgPtr->users()) {
+ Instruction *CI = dyn_cast<Instruction>(U);
+ if (!CI || CI->getParent()->getParent() != &F)
+ continue;
+
+ for (auto *U : CI->users()) {
+ int64_t Offset = 0;
+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+ if (!Load) {
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+
+ Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+ }
+
+ if (!Load || !Load->isSimple())
+ continue;
+
+        // FIXME: Expand to handle merged loads.
+ LLVMContext &Ctx = F.getParent()->getContext();
+ Type *LoadTy = Load->getType();
+ HiddenArg HA = getHiddenArgFromOffset(Offset);
+ if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+ continue;
+
+ ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+ }
+ }
+
+ if (ImplicitArgLoads.empty())
+ return;
+
+ // Allocate loads in order of offset. We need to be sure that the implicit
+ // argument can actually be preloaded.
+ std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
+
+ // If we fail to preload any implicit argument we know we don't have SGPRs
+ // to preload any subsequent ones with larger offsets. Find the first
+ // argument that we cannot preload.
+ auto *PreloadEnd =
+ std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+ [&](const std::pair<LoadInst *, unsigned> &Load) {
+ unsigned LoadSize =
+ DL.getTypeStoreSize(Load.first->getType());
+ unsigned LoadOffset = Load.second;
+ if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
+ ImplicitArgsBaseOffset))
+ return true;
+
+ return false;
+ });
+
+ if (PreloadEnd == ImplicitArgLoads.begin())
+ return;
+
+ unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+ Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+ assert(NF);
+ FunctionsToErase.push_back(&F);
+ for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+ LoadInst *LoadInst = I->first;
+ unsigned LoadOffset = I->second;
+ unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+ unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+ Argument *Arg = NF->getArg(Index);
+ LoadInst->replaceAllUsesWith(Arg);
+ }
+ }
+};
+
class AMDGPUInformationCache : public InformationCache {
public:
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -1314,19 +1523,64 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- for (unsigned I = 0;
- I < F.arg_size() &&
- I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
- ++I) {
- Argument &Arg = *F.getArg(I);
- // Check for incompatible attributes.
- if (Arg.hasByRefAttr() || Arg.hasNestAttr())
- break;
+static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
+ TargetMachine &TM) {
+ SmallVector<Function *, 4> FunctionsToErase;
+ for (auto *F : Functions) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+ if (!ST.hasKernargPreload() ||
+ F->getCallingConv() != CallingConv::AMDGPU_KERNEL || F->arg_empty())
+ continue;
+
+ PreloadKernelArgInfo PreloadInfo(*F, ST);
+ uint64_t ExplicitArgOffset = 0;
+ const DataLayout &DL = F->getDataLayout();
+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+ unsigned NumPreloadsRequested = KernargPreloadCount;
+ unsigned NumPreloadedExplicitArgs = 0;
+ for (Argument &Arg : F->args()) {
+ // Avoid incompatible attributes and guard against running this pass
+ // twice.
+ if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+ Arg.hasAttribute("amdgpu-hidden-argument"))
+ break;
+
+ // Inreg may be pre-existing on some arguments, try to preload these.
+ if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+ break;
+
+ // FIXME: Preload aggregates.
+ if (Arg.getType()->isAggregateType())
+ break;
+
+ Type *ArgTy = Arg.getType();
+ Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+ if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+ break;
+
+ Arg.addAttr(Attribute::InReg);
+ NumPreloadedExplicitArgs++;
+ if (NumPreloadsRequested > 0)
+ NumPreloadsRequested--;
+ }
- Arg.addAttr(Attribute::InReg);
+ // Only try preloading hidden arguments if we can successfully preload the
+ // last explicit argument.
+ if (NumPreloadedExplicitArgs == F->arg_size()) {
+ uint64_t ImplicitArgsBaseOffset =
+ alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+ BaseOffset;
+ PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+ FunctionsToErase);
+ }
}
+
+ // Erase cloned functions if we needed to update the kernel signature to
+ // support preloading hidden kernel arguments.
+ for (auto *F : FunctionsToErase)
+ F->eraseFromParent();
}
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
@@ -1378,8 +1632,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
if (!AMDGPU::isEntryFunctionCC(CC)) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
- } else if (CC == CallingConv::AMDGPU_KERNEL) {
- addPreloadKernArgHint(*F, TM);
}
for (auto &I : instructions(F)) {
@@ -1400,6 +1652,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
}
ChangeStatus Change = A.run();
+
+ // Mark kernel arguments with 'inreg' attribute to indicate that they should
+ // be preloaded into SGPRs.
+ markKernelArgsAsInreg(Functions, TM);
+
return Change == ChangeStatus::CHANGED;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index e9d009baa20af2..7f6c5b4b476d8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -27,230 +27,6 @@ using namespace llvm;
namespace {
-class PreloadKernelArgInfo {
-private:
- Function &F;
- const GCNSubtarget &ST;
- unsigned NumFreeUserSGPRs;
-
- enum HiddenArg : unsigned {
- HIDDEN_BLOCK_COUNT_X,
- HIDDEN_BLOCK_COUNT_Y,
- HIDDEN_BLOCK_COUNT_Z,
- HIDDEN_GROUP_SIZE_X,
- HIDDEN_GROUP_SIZE_Y,
- HIDDEN_GROUP_SIZE_Z,
- HIDDEN_REMAINDER_X,
- HIDDEN_REMAINDER_Y,
- HIDDEN_REMAINDER_Z,
- END_HIDDEN_ARGS
- };
-
- // Stores information about a specific hidden argument.
- struct HiddenArgInfo {
- // Offset in bytes from the location in the kernearg segment pointed to by
- // the implicitarg pointer.
- uint8_t Offset;
- // The size of the hidden argument in bytes.
- uint8_t Size;
- // The name of the hidden argument in the kernel signature.
- const char *Name;
- };
-
- static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
- {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
- {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
- {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
- {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
- {22, 2, "_hidden_remainder_z"}};
-
- static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
- for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
- if (HiddenArgs[I].Offset == Offset)
- return static_cast<HiddenArg>(I);
-
- return END_HIDDEN_ARGS;
- }
-
- static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
- if (HA < END_HIDDEN_ARGS)
- return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
-
- llvm_unreachable("Unexpected hidden argument.");
- }
-
- static const char *getHiddenArgName(HiddenArg HA) {
- if (HA < END_HIDDEN_ARGS) {
- return HiddenArgs[HA].Name;
- }
- llvm_unreachable("Unexpected hidden argument.");
- }
-
- // Clones the function after adding implicit arguments to the argument list
- // and returns the new updated function. Preloaded implicit arguments are
- // added up to and including the last one that will be preloaded, indicated by
- // LastPreloadIndex. Currently preloading is only performed on the totality of
- // sequential data from the kernarg segment including implicit (hidden)
- // arguments. This means that all arguments up to the last preloaded argument
- // will also be preloaded even if that data is unused.
- Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
- FunctionType *FT = F.getFunctionType();
- LLVMContext &Ctx = F.getParent()->getContext();
- SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
- for (unsigned I = 0; I <= LastPreloadIndex; ++I)
- FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
-
- FunctionType *NFT =
- FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
- Function *NF =
- Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
-
- NF->copyAttributesFrom(&F);
- NF->copyMetadata(&F, 0);
- NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
-
- F.getParent()->getFunctionList().insert(F.getIterator(), NF);
- NF->takeName(&F);
- NF->splice(NF->begin(), &F);
-
- Function::arg_iterator NFArg = NF->arg_begin();
- for (Argument &Arg : F.args()) {
- Arg.replaceAllUsesWith(&*NFArg);
- NFArg->takeName(&Arg);
- ++NFArg;
- }
-
- AttrBuilder AB(Ctx);
- AB.addAttribute(Attribute::InReg);
- AB.addAttribute("amdgpu-hidden-argument");
- AttributeList AL = NF->getAttributes();
- for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
- AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
- NFArg++->setName(getHiddenArgName(HiddenArg(I)));
- }
-
- NF->setAttributes(AL);
- F.replaceAllUsesWith(NF);
- F.setCallingConv(CallingConv::C);
-
- return NF;
- }
-
-public:
- PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
- setInitialFreeUserSGPRsCount();
- }
-
- // Returns the maximum number of user SGPRs that we have available to preload
- // arguments.
- void setInitialFreeUserSGPRsCount() {
- GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
- NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
- }
-
- bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
- uint64_t LastExplicitArgOffset) {
- // Check if this argument may be loaded into the same register as the
- // previous argument.
- if (ArgOffset - LastExplicitArgOffset < 4 &&
- !isAligned(Align(4), ArgOffset))
- return true;
-
- // Pad SGPRs for kernarg alignment.
- ArgOffset = alignDown(ArgOffset, 4);
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
- unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
- unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
- if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
- return false;
-
- NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
- return true;
- }
-
- // Try to allocate SGPRs to preload implicit kernel arguments.
- void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
- uint64_t LastExplicitArgOffset,
- IRBuilder<> &Builder) {
- Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
- F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
- if (!ImplicitArgPtr)
- return;
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- // Pair is the load and the load offset.
- SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
- for (auto *U : ImplicitArgPtr->users()) {
- Instruction *CI = dyn_cast<Instruction>(U);
- if (!CI || CI->getParent()->getParent() != &F)
- continue;
-
- for (auto *U : CI->users()) {
- int64_t Offset = 0;
- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
- if (!Load) {
- if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
- continue;
-
- Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
- }
-
- if (!Load || !Load->isSimple())
- continue;
-
- // FIXME: Expand to handle 64-bit implicit args and large merged loads.
- LLVMContext &Ctx = F.getParent()->getContext();
- Type *LoadTy = Load->getType();
- HiddenArg HA = getHiddenArgFromOffset(Offset);
- if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
- continue;
-
- ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
- }
- }
-
- if (ImplicitArgLoads.empty())
- return;
-
- // Allocate loads in order of offset. We need to be sure that the implicit
- // argument can actually be preloaded.
- std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
-
- // If we fail to preload any implicit argument we know we don't have SGPRs
- // to preload any subsequent ones with larger offsets. Find the first
- // argument that we cannot preload.
- auto *PreloadEnd = std::find_if(
- ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
- [&](const std::pair<LoadInst *, unsigned> &Load) {
- unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
- unsigned LoadOffset = Load.second;
- if (!tryAllocPreloadSGPRs(LoadSize,
- LoadOffset + ImplicitArgsBaseOffset,
- LastExplicitArgOffset))
- return true;
-
- LastExplicitArgOffset =
- ImplicitArgsBaseOffset + LoadOffset + LoadSize;
- return false;
- });
-
- if (PreloadEnd == ImplicitArgLoads.begin())
- return;
-
- unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
- Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
- assert(NF);
- for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
- LoadInst *LoadInst = I->first;
- unsigned LoadOffset = I->second;
- unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
- unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
- Argument *Arg = NF->getArg(Index);
- LoadInst->replaceAllUsesWith(Arg);
- }
- }
-};
-
class AMDGPULowerKernelArguments : public FunctionPass {
public:
static char ID;
@@ -310,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
uint64_t ExplicitArgOffset = 0;
- // Preloaded kernel arguments must be sequential.
- bool InPreloadSequence = true;
- PreloadKernelArgInfo PreloadInfo(F, ST);
-
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -324,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
- uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
- // Guard against the situation where hidden arguments have already been
- // lowered and added to the kernel function signiture, i.e. in a situation
- // where this pass has run twice.
- if (Arg.hasAttribute("amdgpu-hidden-argument"))
- break;
-
- // Try to preload this argument into user SGPRs.
- if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
- !Arg.getType()->isAggregateType())
- if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
- LastExplicitArgOffset))
- continue;
-
- InPreloadSequence = false;
-
- if (Arg.use_empty())
+ // Inreg arguments should be preloaded.
+ if (Arg.use_empty() || Arg.hasInRegAttr())
continue;
// If this is byval, the loads are already explicit in the function. We just
@@ -482,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
- if (InPreloadSequence) {
- uint64_t ImplicitArgsBaseOffset =
- alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
- BaseOffset;
- PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
- ExplicitArgOffset, Builder);
- }
-
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b26ddbdd7a342e..5cba777959d8ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -625,7 +625,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
ret <4 x float> %r
}
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
@@ -739,7 +739,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
ret void
}
-define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
@@ -843,7 +843,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
@@ -925,7 +925,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
index 798a3ee1d75fd1..076c12e0d5bc76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
@@ -8,7 +8,7 @@ declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr
; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn:
; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc
-define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) {
+define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
main_body:
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
store float %ret, ptr undef
@@ -17,7 +17,7 @@ main_body:
; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn:
; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc
-define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
main_body:
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
store <2 x half> %ret, ptr undef
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 8af5db9f629083..513ffb38fe7f98 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -261,7 +261,7 @@ main_body:
; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -425,7 +425,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -573,7 +573,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
@@ -734,7 +734,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 7342c366799e9c..68bc20456be6a0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1486,7 +1486,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
ret void
}
-define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1583,7 +1583,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
ret void
}
-define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index 180f01257f1f6d..c48ddbac43e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -1,26 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
+
define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -32,17 +27,13 @@ define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1)
define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
@@ -57,17 +48,13 @@ define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1)
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
@@ -82,17 +69,13 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
@@ -107,17 +90,13 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %ou
define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
@@ -132,17 +111,13 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
@@ -157,17 +132,13 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
-; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
@@ -182,17 +153,13 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %ou
define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
-; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2
@@ -207,18 +174,14 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %ou
define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
@@ -235,18 +198,14 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
@@ -263,18 +222,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
@@ -291,22 +246,15 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[VAL_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[VAL_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[VAL]] to i32
; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[VAL:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[VAL]] to i32
@@ -325,9 +273,6 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0
; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4
@@ -338,12 +283,11 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD_X]], i32 0
; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[LOAD_Y]], i32 1
; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[LOAD_Z]], i32 2
-; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0
; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4
@@ -374,19 +318,15 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -405,19 +345,15 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -436,19 +372,15 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -467,9 +399,6 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
@@ -483,12 +412,11 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
-; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
@@ -525,19 +453,15 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -556,19 +480,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Y_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -587,19 +507,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -618,9 +534,6 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
@@ -634,12 +547,11 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
-; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18
; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
@@ -676,19 +588,15 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32
-; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
@@ -708,17 +616,13 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) %ou
define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) %out, i192 %t0, i32 %t1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i192 [[T0:%.*]], i32 [[T1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i192 inreg [[T0:%.*]], i32 inreg [[T1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
@@ -733,9 +637,6 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) %out, i
define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
; NO-PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
@@ -748,12 +649,11 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD0]], i32 0
; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1
; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2
-; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z
; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
-; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8
; PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
index 658ef33f74935d..712521d0573474 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
@@ -1,30 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=2 -S < %s | FileCheck -check-prefix=PRELOAD-2 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD-ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=2 -S < %s | FileCheck -check-prefix=PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD-ALL %s
define amdgpu_kernel void @ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; NO-PRELOAD-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-2-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-ALL-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -37,37 +30,22 @@ define amdgpu_kernel void @ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(
define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
-; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
-; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
-; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
@@ -84,63 +62,30 @@ define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr
define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
-; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40
-; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
-; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
+; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24
-; PRELOAD-2-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40
-; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
-; PRELOAD-2-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-2-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
-; PRELOAD-2-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-2-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
+; PRELOAD-2-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-ALL-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56
-; PRELOAD-ALL-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-ALL-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
@@ -148,7 +93,7 @@ define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr ad
; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
-; PRELOAD-ALL-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
+; PRELOAD-ALL-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4
; PRELOAD-ALL-NEXT: ret void
;
%load = load i32, ptr addrspace(1) %in
@@ -165,37 +110,22 @@ define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr ad
define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24
-; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24
-; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
@@ -212,37 +142,22 @@ define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel(ptr addrspace
define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24
-; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24
-; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
@@ -259,46 +174,26 @@ define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel(
define amdgpu_kernel void @i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32
-; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
-; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
+; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
-; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel
; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
@@ -319,36 +214,22 @@ define amdgpu_kernel void @i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel(i16 %arg0,
define amdgpu_kernel void @i16_i16_ptr1_kernel(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
-; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel
; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
@@ -365,26 +246,18 @@ define amdgpu_kernel void @i16_i16_ptr1_kernel(i16 %arg0, i16 %arg1, ptr addrspa
define amdgpu_kernel void @ptr1_i8_kernel(ptr addrspace(1) %out, i8 %arg0) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -397,26 +270,18 @@ define amdgpu_kernel void @ptr1_i8_kernel(ptr addrspace(1) %out, i8 %arg0) {
define amdgpu_kernel void @ptr1_i8_zeroext_kernel(ptr addrspace(1) %out, i8 zeroext %arg0) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 zeroext [[ARG0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -429,26 +294,18 @@ define amdgpu_kernel void @ptr1_i8_zeroext_kernel(ptr addrspace(1) %out, i8 zero
define amdgpu_kernel void @ptr1_i16_kernel(ptr addrspace(1) %out, i16 %arg0) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -461,23 +318,16 @@ define amdgpu_kernel void @ptr1_i16_kernel(ptr addrspace(1) %out, i16 %arg0) {
define amdgpu_kernel void @ptr1_i32_kernel(ptr addrspace(1) %out, i32 %arg0) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i32_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[ARG0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i32_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i32_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -488,29 +338,18 @@ define amdgpu_kernel void @ptr1_i32_kernel(ptr addrspace(1) %out, i32 %arg0) {
define amdgpu_kernel void @i32_ptr1_i32_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]]
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel
; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -523,38 +362,22 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel(i32 %arg0, ptr addrspace(1) %out,
define amdgpu_kernel void @ptr1_i16_i16_kernel(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
-; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8
-; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
-; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32
+; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
@@ -571,25 +394,16 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel(ptr addrspace(1) %out, i16 %arg0,
define amdgpu_kernel void @ptr1_v2i8_kernel(ptr addrspace(1) %out, <2 x i8> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to <2 x i8>
-; NO-PRELOAD-NEXT: store <2 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: ret void
;
@@ -600,37 +414,23 @@ define amdgpu_kernel void @ptr1_v2i8_kernel(ptr addrspace(1) %out, <2 x i8> %in)
define amdgpu_kernel void @ptr1_byref_i32_i32_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
-; NO-PRELOAD-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
-; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
-; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
-; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
-; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256
-; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260
-; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
%in = load i32, ptr addrspace(4) %in.byref
@@ -642,37 +442,23 @@ define amdgpu_kernel void @ptr1_byref_i32_i32_kernel(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
-; NO-PRELOAD-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
-; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
-; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
-; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
-; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256
-; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260
-; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
+; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4
; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4
-; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
%in = load i32, ptr addrspace(4) %in.byref
@@ -684,28 +470,17 @@ define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1)
define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8 x i32> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-ALL-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
store <8 x i32> %in, ptr addrspace(1) %out, align 4
@@ -715,24 +490,16 @@ define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8
define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3 x i16> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NO-PRELOAD-NEXT: store <3 x i16> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -743,24 +510,16 @@ define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3 x i32> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NO-PRELOAD-NEXT: store <3 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -771,24 +530,16 @@ define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3 x float> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NO-PRELOAD-NEXT: store <3 x float> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -799,23 +550,16 @@ define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5 x i8> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <5 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -826,28 +570,17 @@ define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5
define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5 x double> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
-; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64
-; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-ALL-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
store <5 x double> %in, ptr addrspace(1) %out, align 8
@@ -857,23 +590,16 @@ define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5
define amdgpu_kernel void @ptr1_v8i8_kernel(ptr addrspace(1) %out, <8 x i8> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <8 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
@@ -884,23 +610,16 @@ define amdgpu_kernel void @ptr1_v8i8_kernel(ptr addrspace(1) %out, <8 x i8> %in)
define amdgpu_kernel void @ptr1_i64_kernel(ptr addrspace(1) %out, i64 %a) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i64_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[A_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[A_LOAD:%.*]] = load i64, ptr addrspace(4) [[A_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i64 [[A_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i64_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i64_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
@@ -911,23 +630,16 @@ define amdgpu_kernel void @ptr1_i64_kernel(ptr addrspace(1) %out, i64 %a) {
define amdgpu_kernel void @ptr1_f64_kernel(ptr addrspace(1) %out, double %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_f64_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], double [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load double, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store double [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_f64_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_f64_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
@@ -938,25 +650,16 @@ define amdgpu_kernel void @ptr1_f64_kernel(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @ptr1_half_kernel(ptr addrspace(1) %out, half %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
-; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: ret void
;
@@ -967,25 +670,16 @@ define amdgpu_kernel void @ptr1_half_kernel(ptr addrspace(1) %out, half %in) {
define amdgpu_kernel void @ptr1_bfloat_kernel(ptr addrspace(1) %out, bfloat %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], bfloat [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to bfloat
-; NO-PRELOAD-NEXT: store bfloat [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: ret void
;
@@ -996,23 +690,16 @@ define amdgpu_kernel void @ptr1_bfloat_kernel(ptr addrspace(1) %out, bfloat %in)
define amdgpu_kernel void @ptr1_v2bfloat_kernel(ptr addrspace(1) %out, <2 x bfloat> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <2 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <2 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -1023,24 +710,16 @@ define amdgpu_kernel void @ptr1_v2bfloat_kernel(ptr addrspace(1) %out, <2 x bflo
define amdgpu_kernel void @ptr1_v3bfloat_kernel(ptr addrspace(1) %out, <3 x bfloat> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <3 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NO-PRELOAD-NEXT: store <3 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
@@ -1051,23 +730,16 @@ define amdgpu_kernel void @ptr1_v3bfloat_kernel(ptr addrspace(1) %out, <3 x bflo
define amdgpu_kernel void @ptr1_v6bfloat_kernel(ptr addrspace(1) %out, <6 x bfloat> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <6 x bfloat> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <6 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <6 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-ALL-NEXT: ret void
;
@@ -1078,35 +750,18 @@ define amdgpu_kernel void @ptr1_v6bfloat_kernel(ptr addrspace(1) %out, <6 x bflo
define amdgpu_kernel void @ptr1_half_v7bfloat_kernel(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
-; NO-PRELOAD-NEXT: store <7 x bfloat> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; NO-PRELOAD-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
+; NO-PRELOAD-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
-; PRELOAD-2-NEXT: store <7 x bfloat> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; PRELOAD-2-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; PRELOAD-ALL-NEXT: ret void
@@ -1119,24 +774,16 @@ define amdgpu_kernel void @ptr1_half_v7bfloat_kernel(ptr addrspace(1) %out, half
define amdgpu_kernel void @ptr1_i1_kernel(ptr addrspace(1) %out, i1 %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i1 [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
-; NO-PRELOAD-NEXT: store i1 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 1
+; NO-PRELOAD-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1
; PRELOAD-ALL-NEXT: ret void
;
@@ -1147,23 +794,16 @@ define amdgpu_kernel void @ptr1_i1_kernel(ptr addrspace(1) %out, i1 %in) {
define amdgpu_kernel void @ptr1_fp128_kernel(ptr addrspace(1) %out, fp128 %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], fp128 [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load fp128, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store fp128 [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_fp128_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-ALL-NEXT: ret void
;
@@ -1174,23 +814,16 @@ define amdgpu_kernel void @ptr1_fp128_kernel(ptr addrspace(1) %out, fp128 %in) {
define amdgpu_kernel void @ptr1_v7i8_kernel(ptr addrspace(1) %out, <7 x i8> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <7 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
@@ -1201,23 +834,16 @@ define amdgpu_kernel void @ptr1_v7i8_kernel(ptr addrspace(1) %out, <7 x i8> %in)
define amdgpu_kernel void @ptr1_v7half_kernel(ptr addrspace(1) %out, <7 x half> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x half> [[IN:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x half>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store <7 x half> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; NO-PRELOAD-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7half_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16
; PRELOAD-ALL-NEXT: ret void
;
@@ -1228,34 +854,18 @@ define amdgpu_kernel void @ptr1_v7half_kernel(ptr addrspace(1) %out, <7 x half>
define amdgpu_kernel void @ptr1_i16_i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12
-; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
-; NO-PRELOAD-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4
+; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; NO-PRELOAD-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12
-; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
-; PRELOAD-2-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4
+; PRELOAD-2-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -1268,36 +878,18 @@ define amdgpu_kernel void @ptr1_i16_i32_ptr1_kernel(ptr addrspace(1) %out, i16 %
define amdgpu_kernel void @ptr1_i16_v3i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
-; NO-PRELOAD-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; NO-PRELOAD-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
-; PRELOAD-2-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16
+; PRELOAD-2-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16
; PRELOAD-ALL-NEXT: ret void
@@ -1310,38 +902,18 @@ define amdgpu_kernel void @ptr1_i16_v3i32_ptr1_kernel(ptr addrspace(1) %out, i16
define amdgpu_kernel void @ptr1_i16_i16_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
-; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
-; NO-PRELOAD-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; NO-PRELOAD-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
-; PRELOAD-2-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; PRELOAD-2-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; PRELOAD-ALL-NEXT: ret void
@@ -1354,40 +926,18 @@ define amdgpu_kernel void @ptr1_i16_i16_ptr1_kernel(ptr addrspace(1) %out, i16 %
define amdgpu_kernel void @ptr1_i16_v2i8_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
-; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP5]] to <2 x i8>
-; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2
-; NO-PRELOAD-NEXT: store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
+; NO-PRELOAD-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8
-; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
-; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP3]] to <2 x i8>
-; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
-; PRELOAD-2-NEXT: store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2
+; PRELOAD-2-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2
; PRELOAD-ALL-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2
; PRELOAD-ALL-NEXT: ret void
@@ -1400,29 +950,18 @@ define amdgpu_kernel void @ptr1_i16_v2i8_ptr1_kernel(ptr addrspace(1) %out, i16
define amdgpu_kernel void @i32_ptr1_i32_staggered_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16
-; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]]
-; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16
-; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]]
+; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel
; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]]
; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
@@ -1435,26 +974,18 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_kernel(i32 %arg0, ptr addrspac
define amdgpu_kernel void @ptr1_i8_i32_trailing_unused_kernel(ptr addrspace(1) %out, i8 %arg0, i32 %unused) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 8
-; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]]
-; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
-; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32
-; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
+; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] {
-; PRELOAD-2-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel
; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 inreg [[UNUSED:%.*]]) #[[ATTR0]] {
-; PRELOAD-ALL-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32
; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 0f60888bcb2f5d..20858bc603b99b 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -566,13 +566,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB14_0:
-; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
; GFX940-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v1, s9
; GFX940-NEXT: v_mov_b32_e32 v2, s10
; GFX940-NEXT: v_mov_b32_e32 v3, s11
@@ -583,6 +580,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX940-NEXT: v_mov_b32_e32 v2, s6
; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, s12
+; GFX940-NEXT: v_mov_b32_e32 v1, s13
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] offset:32 sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v5f64_arg:
@@ -593,13 +594,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
@@ -610,6 +608,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX90a-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-NEXT: v_mov_b32_e32 v3, s11
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: s_nop 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -941,17 +943,15 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-NEXT: global_store_short v3, v0, s[14:15] offset:12
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -1191,15 +1191,13 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v4, s8
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 08cc2e4ec7d794..6288a80446cf0f 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -301,7 +301,7 @@ define hidden i32 @called(i32 %a) noinline {
ret i32 %sub
}
-define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
+define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) {
; GFX9-O0-LABEL: call:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -533,7 +533,7 @@ define i64 @called_i64(i64 %a) noinline {
ret i64 %sub
}
-define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
+define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
; GFX9-O0-LABEL: call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -1153,7 +1153,7 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
ret i32 %sub
}
-define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
+define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) {
; GFX9-O0-LABEL: strict_wwm_call:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -1385,7 +1385,7 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
ret i64 %sub
}
-define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
+define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_mov_b32 s32, 0
>From cdfd7283a029b7147abf1f68bb734ab0d12e3150 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Fri, 31 Jan 2025 09:28:12 -0800
Subject: [PATCH 3/3] Rebase and address comments.
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 ++-
.../preload-implicit-kernargs-IR-lowering.ll | 30 +++++++++-------
.../AMDGPU/preload-kernargs-IR-lowering.ll | 36 +++++++++----------
3 files changed, 39 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d7c700f40c824b..07e03bdc919ef0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1529,7 +1529,7 @@ static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
for (auto *F : Functions) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
if (!ST.hasKernargPreload() ||
- F->getCallingConv() != CallingConv::AMDGPU_KERNEL || F->arg_empty())
+ F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
continue;
PreloadKernelArgInfo PreloadInfo(*F, ST);
@@ -1541,6 +1541,8 @@ static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
for (Argument &Arg : F->args()) {
// Avoid incompatible attributes and guard against running this pass
// twice.
+ //
+ // TODO: Preload byref kernel arguments
if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
Arg.hasAttribute("amdgpu-hidden-argument"))
break;
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index c48ddbac43e4e6..ff5d5c8dea5676 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -3,24 +3,30 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
-define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
+define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) %out, ptr addrspace(1) byref(i32) %arg) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
-; NO-PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[ARG]], align 4
+; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]]
+; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
-; PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
-; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[ARG]], align 4
+; PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]]
+; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-NEXT: ret void
;
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
- %load = load i32, ptr addrspace(4) %imp_arg_ptr
- store i32 %load, ptr addrspace(1) %out
+ %load0 = load i32, ptr addrspace(4) %imp_arg_ptr
+ %load1 = load i32, ptr addrspace(1) %arg
+ %add = add i32 %load0, %load1
+ store i32 %add, ptr addrspace(1) %out
ret void
}
@@ -87,16 +93,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, <16 x i32> inreg) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
-; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
-; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
index 712521d0573474..89ad57a51ff1e0 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll
@@ -469,17 +469,17 @@ define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1)
define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8 x i32> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -489,17 +489,17 @@ define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8
define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3 x i16> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -509,17 +509,17 @@ define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3 x i32> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -529,17 +529,17 @@ define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3 x float> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -549,17 +549,17 @@ define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3
define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5 x i8> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-ALL-NEXT: ret void
;
@@ -569,17 +569,17 @@ define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5
define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5 x double> %in) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
-; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
-; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-2-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-2-NEXT: ret void
;
; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel
-; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
+; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] {
; PRELOAD-ALL-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8
; PRELOAD-ALL-NEXT: ret void
;
More information about the llvm-commits
mailing list