[clang] [Clang][AMDGPU] Avoid Using `byval` for `ndrange_t` when emitting `__enqueue_kernel_basic` (PR #116435)
Shilei Tian via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 15 12:35:53 PST 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/116435
>From 50b6941f3ee43f5c4e559112fbb3fd8fa9fb6016 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Fri, 15 Nov 2024 15:04:39 -0500
Subject: [PATCH] [Clang][AMDGPU] Avoid Using `byval` for `ndrange_t` when
emitting `__enqueue_kernel_basic`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
AMDGPU disabled the use of `byval` for struct argument passing in commit
d77c620. However, when emitting `__enqueue_kernel_basic`, Clang still adds the
`byval` attribute by default. This PR introduces a target check to avoid
emitting the `byval` attribute unnecessarily.
FWIW, I’m not sure if it’s a good idea to emit the `byval` attribute by default
in this context. The way arguments are passed is highly target-dependent, and
making assumptions here doesn’t seem ideal.
---
clang/lib/CodeGen/CGBuiltin.cpp | 11 ++---------
.../test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 16 ++++++++--------
.../cl20-device-side-enqueue-attributes.cl | 4 ++--
.../CodeGenOpenCL/cl20-device-side-enqueue.cl | 12 ++++++------
4 files changed, 18 insertions(+), 25 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e03cb6e8c3c861..0f29dbb9509296 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5985,15 +5985,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
llvm::Value *Block =
Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
- AttrBuilder B(Builder.getContext());
- B.addByValAttr(NDRangeL.getAddress().getElementType());
- llvm::AttributeList ByValAttrSet =
- llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
-
- auto RTCall =
- EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
- {Queue, Flags, Range, Kernel, Block});
- RTCall->setAttributes(ByValAttrSet);
+ auto RTCall = EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
+ {Queue, Flags, Range, Kernel, Block});
return RValue::get(RTCall);
}
assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 2fc7f9a24b8874..62b5661da9dbd8 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -140,7 +140,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
// NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8
-// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
+// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
// NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
// NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -162,7 +162,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
// NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
// NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8
-// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
+// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
// NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
// NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -204,7 +204,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
// NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8
-// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
+// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
// NOCPU-NEXT: ret void
//
//
@@ -365,7 +365,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
-// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// NOCPU-NEXT: ret void
//
//
@@ -486,7 +486,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]
-// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
+// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -508,7 +508,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
+// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -553,7 +553,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA13]]
-// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
+// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
@@ -709,7 +709,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
-// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
index 1544770dfa4f30..451d30b4d86f0e 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
@@ -55,7 +55,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
// SPIR32-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
// SPIR32-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
// SPIR32-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
-// SPIR32-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
+// SPIR32-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
// SPIR32-NEXT: ret void
//
//
@@ -126,7 +126,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
// STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
// STRICTFP-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
// STRICTFP-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
-// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
+// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR5]]
// STRICTFP-NEXT: ret void
//
//
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index f0c164795b7642..a1408d38a44c90 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -89,8 +89,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: store ptr addrspace(4) addrspacecast (ptr [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to ptr addrspace(4)), ptr %block.invoke
// COMMON: [[BL_I8:%[0-9]+]] ={{.*}} addrspacecast ptr %block to ptr addrspace(4)
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
- // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
- // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
+ // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
+ // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVLK1:[^ ]+_kernel]] to ptr addrspace(4)),
// COMMON-SAME: ptr addrspace(4) [[BL_I8]])
enqueue_kernel(default_queue, flags, ndrange,
@@ -333,8 +333,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// X86: [[DEF_Q:%[0-9]+]] = load ptr, ptr %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, ptr %flags
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
- // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
- // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
+ // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
+ // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVGK8:[^ ]+_kernel]] to ptr addrspace(4)),
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr addrspace(1) [[BLG8]] to ptr addrspace(4)))
enqueue_kernel(default_queue, flags, ndrange, block_A);
@@ -382,8 +382,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: [[FLAGS:%[0-9]+]] = load i32, ptr %flags
// COMMON: [[BL_I8:%[0-9]+]] ={{.*}} addrspacecast ptr {{.*}} to ptr addrspace(4)
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
- // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
- // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
+ // SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
+ // X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVLK3:[^ ]+_kernel]] to ptr addrspace(4)),
// COMMON-SAME: ptr addrspace(4) [[BL_I8]])
enqueue_kernel(default_queue, flags, ndrange, block_C);
More information about the cfe-commits
mailing list