[clang] [clang][OpenCL][CodeGen][AMDGPU] Do not use `private` as the default AS when `generic` is available (PR #112442)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Oct 15 14:54:57 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-clang-codegen
Author: Alex Voicu (AlexVlx)
<details>
<summary>Changes</summary>
Currently, for AMDGPU, when compiling for OpenCL, we unconditionally use `private` as the default address space. This is wrong whenever the `generic` address space is available, which this patch corrects. In general, this abuse of the AS map is a hack that should be reworked altogether, but at least after this patch we stop being incorrect for e.g. OpenCL 2.0.
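For context, here is a minimal OpenCL C sketch (not part of the patch; `consume` and `k` are made-up names) of the behaviour the default address space governs: under `-cl-std=CL2.0`, or CL3.0 with `__opencl_c_generic_address_space`, an unqualified pointer is a generic pointer while automatic variables still live in private memory, so the frontend's choice of default AS decides how code like this is lowered.

```c
// Illustrative only -- not taken from the patch. With the generic address
// space available, an unqualified `int *` is a pointer into the generic AS,
// whereas `x` is an automatic variable in private memory, so `&x` is
// implicitly converted from private to generic at the call site. With the
// old AS map, clang kept treating the default address space as private on
// amdgcn even in this mode.
void consume(int *p) { *p += 1; }   // unqualified pointer => generic AS

kernel void k(global int *out) {
  int x = 42;      // private (automatic) storage
  consume(&x);     // implicit private -> generic conversion
  *out = x;
}
```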
---
Patch is 367.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112442.diff
20 Files Affected:
- (modified) clang/lib/Basic/Targets/AMDGPU.cpp (+3-3)
- (modified) clang/lib/CodeGen/CGBlocks.cpp (+2-1)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+3-2)
- (modified) clang/test/CodeGenOpenCL/addr-space-struct-arg.cl (+1440-81)
- (modified) clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl (+85-41)
- (modified) clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl (+164-118)
- (modified) clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl (+275-220)
- (modified) clang/test/CodeGenOpenCL/amdgpu-nullptr.cl (+14-14)
- (modified) clang/test/CodeGenOpenCL/atomic-ops.cl (+808-219)
- (modified) clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl (+3-3)
- (modified) clang/test/CodeGenOpenCL/blocks.cl (+11-12)
- (modified) clang/test/CodeGenOpenCL/builtins-alloca.cl (+428-4)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+93-62)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl (+18-12)
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+2-2)
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl (+1-1)
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+1-1)
- (modified) clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl (+1-1)
- (modified) clang/test/CodeGenOpenCL/opencl_types.cl (+1-1)
- (modified) clang/test/Index/pipe-size.cl (+2-2)
``````````diff
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 3b748d0249d57b..078819183afdac 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -260,9 +260,9 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
void AMDGPUTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
TargetInfo::adjust(Diags, Opts);
// ToDo: There are still a few places using default address space as private
- // address space in OpenCL, which needs to be cleaned up, then Opts.OpenCL
- // can be removed from the following line.
- setAddressSpaceMap(/*DefaultIsPrivate=*/Opts.OpenCL ||
+ // address space in OpenCL, which needs to be cleaned up, then the references
+ // to OpenCL can be removed from the following line.
+ setAddressSpaceMap((Opts.OpenCL && !Opts.OpenCLGenericAddressSpace) ||
!isAMDGCN(getTriple()));
}
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 684fda74407313..c3a266285011fe 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1397,7 +1397,8 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D,
DI->setLocation(D->getLocation());
DI->EmitDeclareOfBlockLiteralArgVariable(
*BlockInfo, D->getName(), argNum,
- cast<llvm::AllocaInst>(alloc.getPointer()), Builder);
+ cast<llvm::AllocaInst>(alloc.getPointer()->stripPointerCasts()),
+ Builder);
}
}
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 157e743a39bfbc..1b01484a46cf74 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5858,7 +5858,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
llvm::Value *TmpPtr = Tmp.getPointer();
llvm::Value *TmpSize = EmitLifetimeStart(
- CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
+ CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()),
+ TmpPtr->stripPointerCasts());
llvm::Value *ElemPtr;
// Each of the following arguments specifies the size of the corresponding
// argument passed to the enqueued block.
@@ -5903,7 +5904,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
auto Call = RValue::get(
EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
if (TmpSize)
- EmitLifetimeEnd(TmpSize, TmpPtr);
+ EmitLifetimeEnd(TmpSize, TmpPtr->stripPointerCasts());
return Call;
}
// Any calls now have event arguments passed.
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index bab0e21067eeae..cb26fc6e8fcaba 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -1,9 +1,10 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=ALL,X86 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -enable-var-scope -check-prefixes=SPIR %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --version 5
+// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -check-prefixes=SPIR %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -check-prefixes=AMDGCN30 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN30-GVAR %s
typedef int int2 __attribute__((ext_vector_type(2)));
@@ -45,147 +46,1505 @@ struct LargeStructTwoMember {
struct LargeStructOneMember g_s;
#endif
-// X86-LABEL: define{{.*}} void @foo(ptr dead_on_unwind noalias writable sret(%struct.Mat4X4) align 4 %agg.result, ptr noundef byval(%struct.Mat3X3) align 4 %in)
-// AMDGCN-LABEL: define{{.*}} %struct.Mat4X4 @foo([9 x i32] %in.coerce)
Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
Mat4X4 out;
return out;
}
-// ALL-LABEL: define {{.*}} void @ker
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-
-// AMDGCN: load [9 x i32], ptr addrspace(1)
-// AMDGCN: call %struct.Mat4X4 @foo([9 x i32]
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
out[0] = foo(in[1]);
}
-// X86-LABEL: define{{.*}} void @foo_large(ptr dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in)
-// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}}
-// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5)
-// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false)
Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
Mat64X64 out;
return out;
}
-// ALL-LABEL: define {{.*}} void @ker_large
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-// AMDGCN: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
out[0] = foo_large(in[1]);
}
-// AMDGCN-LABEL: define{{.*}} void @FuncOneMember(<2 x i32> %u.coerce)
void FuncOneMember(struct StructOneMember u) {
u.x = (int2)(0, 0);
}
-// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}}
-// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false)
-// AMDGCN-NOT: addrspacecast
-// AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5)
void FuncOneLargeMember(struct LargeStructOneMember u) {
u.x[0] = (int2)(0, 0);
}
-// AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl()
-// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
-// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
void test_indirect_arg_globl(void) {
FuncOneLargeMember(g_s);
}
#endif
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local()
-// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
kernel void test_indirect_arg_local(void) {
local struct LargeStructOneMember l_s;
FuncOneLargeMember(l_s);
}
-// AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private()
-// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN-NOT: @llvm.memcpy
-// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]])
void test_indirect_arg_private(void) {
struct LargeStructOneMember p_s;
FuncOneLargeMember(p_s);
}
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[coerce_dive:.*]] = getelementptr inbounds nuw %struct.StructOneMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: store <2 x i32> %[[u_coerce]], ptr addrspace(5) %[[coerce_dive]]
-// AMDGCN: call void @FuncOneMember(<2 x i32>
kernel void KernelOneMember(struct StructOneMember u) {
FuncOneMember(u);
}
-// SPIR: call void @llvm.memcpy.p0.p1.i32
-// SPIR-NOT: addrspacecast
kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
FuncOneMember(*u);
}
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds nuw %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
-// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
FuncOneLargeMember(u);
}
-// AMDGCN-LABEL: define{{.*}} void @FuncTwoMember(<2 x i32> %u.coerce0, <2 x i32> %u.coerce1)
void FuncTwoMember(struct StructTwoMember u) {
u.y = (int2)(0, 0);
}
-// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember
-// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]])
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
void FuncLargeTwoMember(struct LargeStructTwoMember u) {
u.y[0] = (int2)(0, 0);
}
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[LD0:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: %[[LD1:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]])
kernel void KernelTwoMember(struct StructTwoMember u) {
FuncTwoMember(u);
}
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
-// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
-// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
-// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
-// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
-// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
FuncLargeTwoMember(u);
}
+//.
+// X86: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+//.
+// AMDGCN: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+// AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// AMDGCN20: @g_s = addrspace(1) global %struct.LargeStructOneMember zeroinitializer, align 8
+// AMDGCN20: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+// AMDGCN20: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// SPIR: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+//.
+// AMDGCN30: @g_s = addrspace(1) global %struct.LargeStructOneMember zeroinitializer, align 8
+// AMDGCN30: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+// AMDGCN30: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// AMDGCN30-GVAR: @test_indirect_arg_local.l_s = internal addrspace(3) global %struct.LargeStructOneMember undef, align 8
+// AMDGCN30-GVAR: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define void @foo(
+// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT: ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define spir_kernel void @ker(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// X86-NEXT: ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define void @foo_large(
+// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT: ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define spir_kernel void @ker_large(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// X86-NEXT: ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define void @FuncOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8
+// X86-NEXT: ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define void @FuncOneLargeMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+//
+// X86...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/112442