[clang] 6479256 - clang/OpenCL: Apply default attributes to enqueued blocks

Mon Jan 30 11:03:31 PST 2023

Author: Matt Arsenault
Date: 2023-01-30T15:03:15-04:00
New Revision: 647925648a0fde06b5b44086843f0d8f164a71df

URL: https://github.com/llvm/llvm-project/commit/647925648a0fde06b5b44086843f0d8f164a71df
DIFF: https://github.com/llvm/llvm-project/commit/647925648a0fde06b5b44086843f0d8f164a71df.diff

LOG: clang/OpenCL: Apply default attributes to enqueued blocks

This was missing important environment context, like denormal-fp-math
and target-features. Curiously this seems to be losing nounwind. Note
this only fixes the actual invoke kernel. The invoke function is
already setting the default attribute set for internal
functions. However that is still buggy since it's not applying any use
function attributes (it's also missing uniform-work-group-size).

There seem to be too many different functions for setting attributes
with inconsistent behavior. The Function overload of
addDefaultFunctionAttributes seems to miss the target-cpu and
target-features. The AttrBuilder one seems to miss optnone (but that
seems to be disallowed on blocks anyway). Neither one calls
setTargetAttributes, when it probably should. uniform-work-group-size
is also set through AMDGPU code when it should be emitting generically
as a language property.

I also noticed update_cc_test_checks for attributes seem to not
connect the captured attribute variables to the attributes at the end
(although I think the numbers happen to work out correctly).

Added: 
    clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl

Modified: 
    clang/lib/CodeGen/TargetInfo.cpp
    clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
    clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 7e08d42e866f..9c6b8765163f 100644

--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -12441,6 +12441,12 @@ llvm::Value *TargetCodeGenInfo::createEnqueuedBlockKernel(
       CGF.getTypes().ClangCallConvToLLVMCallConv(CallingConv::CC_OpenCLKernel);
   F->setCallingConv(KernelCC);
 
+  llvm::AttrBuilder KernelAttrs(C);
+
+  // FIXME: This is missing setTargetAttributes
+  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
+  F->addFnAttrs(KernelAttrs);
+
   auto IP = CGF.Builder.saveIP();
   auto *BB = llvm::BasicBlock::Create(C, "entry", F);
   auto &Builder = CGF.Builder;
@@ -12449,10 +12455,6 @@ llvm::Value *TargetCodeGenInfo::createEnqueuedBlockKernel(
   llvm::CallInst *Call = Builder.CreateCall(Invoke, Args);
   Call->setCallingConv(Invoke->getCallingConv());
 
-  // FIXME: Apply default attributes
-  F->addFnAttr(llvm::Attribute::NoUnwind);
-  F->addFnAttr(llvm::Attribute::Convergent);
-
   Builder.CreateRetVoid();
   Builder.restoreIP(IP);
   return F;
@@ -12503,10 +12505,12 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
                                    &CGF.CGM.getModule());
   F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
 
-  // FIXME: Apply default attributes
-  F->addFnAttr(llvm::Attribute::NoUnwind);
-  F->addFnAttr(llvm::Attribute::Convergent);
-  F->addFnAttr("enqueued-block");
+  llvm::AttrBuilder KernelAttrs(C);
+  // FIXME: The invoke isn't applying the right attributes either
+  // FIXME: This is missing setTargetAttributes
+  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
+  KernelAttrs.addAttribute("enqueued-block");
+  F->addFnAttrs(KernelAttrs);
 
   auto IP = CGF.Builder.saveIP();
   auto *BB = llvm::BasicBlock::Create(C, "entry", F);

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index c56aee03441d..e574b1f64c49 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -1,5 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs --prefix-filecheck-ir-name VAR
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefixes=CHECK,NOCPU
+
+// // Check no-optnone and target-cpu behavior
+// RUN: %clang_cc1 -cl-std=CL2.0 -O1 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa -target-cpu gfx900 -target-feature -sram-ecc -fdenormal-fp-math-f32=preserve-sign %s | FileCheck %s --check-prefixes=CHECK,GFX900
+
 
 typedef struct {int a;} ndrange_t;
 
@@ -36,25 +40,19 @@ kernel void test(global char *a, char b, global long *c, long d) {
   enqueue_kernel(default_queue, flags, ndrange, block);
 }
 
-// Test that target attributes are applied to the functions inserted for the
-// block.
+// The target attribute from the caller is not propagated to the block.
+
+// FIXME: Something is broken and inconsistent. Either the builtin use in the
+// block should be rejected, or the target attribute needs to apply to the block
+// as well.
+// https://github.com/llvm/llvm-project/issues/60005
 __attribute__((target("s-memtime-inst")))
 kernel void test_target_features_kernel(global int *i) {
   queue_t default_queue;
   unsigned flags = 0;
   ndrange_t ndrange;
 
-  enqueue_kernel(default_queue, flags, ndrange,
-                 ^(void) {
-                   __builtin_amdgcn_s_memtime();
-                 });
-}
-
-__attribute__((target("s-memtime-inst")))
-void test_target_features_func(global int *i) {
-  queue_t default_queue;
-  unsigned flags = 0;
-  ndrange_t ndrange;
+  __builtin_amdgcn_s_memtime();
 
   enqueue_kernel(default_queue, flags, ndrange,
                  ^(void) {
@@ -64,374 +62,694 @@ void test_target_features_func(global int *i) {
 
 //.
 // CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0
-// CHECK: @__block_literal_global.1 = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_func_block_invoke }, align 8 #0
 //.
-// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@callee
-// CHECK-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
-// CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@test
-// CHECK-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
-// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// CHECK-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
-// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    [[BLOCK:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    [[VARTMP2:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    [[BLOCK3:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
-// CHECK-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
-// CHECK-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
-// CHECK-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8
-// CHECK-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8
-// CHECK-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
-// CHECK-NEXT:    store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
-// CHECK-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
-// CHECK-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
-// CHECK-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
-// CHECK-NEXT:    store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
-// CHECK-NEXT:    store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
-// CHECK-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
-// CHECK-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
-// CHECK-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
-// CHECK-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
-// CHECK-NEXT:    store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
-// CHECK-NEXT:    store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
-// CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
-// CHECK-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// CHECK-NEXT:    [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
-// CHECK-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
-// CHECK-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
-// CHECK-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
-// CHECK-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
-// CHECK-NEXT:    store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
-// CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
-// CHECK-NEXT:    store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
-// CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
-// CHECK-NEXT:    store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
-// CHECK-NEXT:    [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
-// CHECK-NEXT:    store i64 100, ptr addrspace(5) [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
-// CHECK-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
-// CHECK-NEXT:    store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
-// CHECK-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
-// CHECK-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
-// CHECK-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
-// CHECK-NEXT:    store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
-// CHECK-NEXT:    store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8
-// CHECK-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// CHECK-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// CHECK-NEXT:    [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
-// CHECK-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
-// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
-// CHECK-NEXT:    call void @__test_block_invoke(ptr [[TMP2]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
-// CHECK-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
-// CHECK-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
-// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
-// CHECK-NEXT:    call void @__test_block_invoke_2(ptr [[TMP2]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_3
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
-// CHECK-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
-// CHECK-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4
-// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
-// CHECK-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
-// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
-// CHECK-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT:    call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_4
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8
-// CHECK-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
-// CHECK-NEXT:    call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR9:[0-9]+]]
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
-// CHECK-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
-// CHECK-NEXT:    store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
-// CHECK-NEXT:    call void @__test_block_invoke_4(ptr [[TMP2]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@test_target_features_kernel
-// CHECK-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !15 !kernel_arg_access_qual !8 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
-// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
-// CHECK-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
-// CHECK-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
-// CHECK-NEXT:    store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
-// CHECK-NEXT:    call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@test_target_features_func
-// CHECK-SAME: (ptr addrspace(1) noundef [[I:%.*]]) #[[ATTR8:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
-// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// CHECK-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
-// CHECK-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_func_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global.1 to ptr))
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@__test_target_features_func_block_invoke
-// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
-// CHECK-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: convergent nounwind
-// CHECK-LABEL: define {{[^@]+}}@__test_target_features_func_block_invoke_kernel
-// CHECK-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
-// CHECK-NEXT:    store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
-// CHECK-NEXT:    call void @__test_target_features_func_block_invoke(ptr [[TMP2]])
-// CHECK-NEXT:    ret void
+// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@callee
+// NOCPU-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// NOCPU-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
+// NOCPU-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@test
+// NOCPU-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
+// NOCPU-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// NOCPU-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
+// NOCPU-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
+// NOCPU-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    [[VARTMP2:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK3:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
+// NOCPU-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
+// NOCPU-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
+// NOCPU-NEXT:    [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// NOCPU-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
+// NOCPU-NEXT:    [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
+// NOCPU-NEXT:    [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// NOCPU-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8
+// NOCPU-NEXT:    [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
+// NOCPU-NEXT:    [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// NOCPU-NEXT:    store i64 100, ptr addrspace(5) [[TMP21]], align 8
+// NOCPU-NEXT:    [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
+// NOCPU-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
+// NOCPU-NEXT:    store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
+// NOCPU-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
+// NOCPU-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
+// NOCPU-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
+// NOCPU-NEXT:    store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8
+// NOCPU-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// NOCPU-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8
+// NOCPU-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
+// NOCPU-NEXT:    [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// NOCPU-NEXT:    [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke
+// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// NOCPU-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent nounwind
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// NOCPU-NEXT:    call void @__test_block_invoke(ptr [[TMP2]])
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2
+// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// NOCPU-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
+// NOCPU-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent nounwind
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// NOCPU-NEXT:    call void @__test_block_invoke_2(ptr [[TMP2]])
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3
+// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR4]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// NOCPU-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
+// NOCPU-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
+// NOCPU-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
+// NOCPU-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4
+// NOCPU-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
+// NOCPU-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent nounwind
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// NOCPU-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
+// NOCPU-NEXT:    [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// NOCPU-NEXT:    call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4
+// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8
+// NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
+// NOCPU-NEXT:    call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]]
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent nounwind
+// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
+// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
+// NOCPU-NEXT:    store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// NOCPU-NEXT:    call void @__test_block_invoke_4(ptr [[TMP2]])
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel
+// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !15 !kernel_arg_access_qual !8 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !10 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// NOCPU-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
+// NOCPU-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
+// NOCPU-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent noinline nounwind optnone
+// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke
+// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// NOCPU-NEXT:    ret void
+//
+//
+// NOCPU: Function Attrs: convergent nounwind
+// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
+// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-NEXT:  entry:
+// NOCPU-NEXT:    [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
+// NOCPU-NEXT:    store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// NOCPU-NEXT:    call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])
+// NOCPU-NEXT:    ret void
+//
+//
 //
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900-LABEL: define {{[^@]+}}@callee
+// GFX900-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// GFX900-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[TBAA7:![0-9]+]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
+// GFX900-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900-LABEL: define {{[^@]+}}@test
+// GFX900-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !9 !kernel_arg_access_qual !10 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !12 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
+// GFX900-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// GFX900-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
+// GFX900-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
+// GFX900-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// GFX900-NEXT:    [[BLOCK:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    [[VARTMP2:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// GFX900-NEXT:    [[BLOCK3:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// GFX900-NEXT:    [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
+// GFX900-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
+// GFX900-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14:![0-9]+]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16:![0-9]+]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18:![0-9]+]]
+// GFX900-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
+// GFX900-NEXT:    store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
+// GFX900-NEXT:    [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// GFX900-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
+// GFX900-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
+// GFX900-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
+// GFX900-NEXT:    [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// GFX900-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
+// GFX900-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
+// GFX900-NEXT:    store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
+// GFX900-NEXT:    [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// GFX900-NEXT:    store i64 100, ptr addrspace(5) [[TMP21]], align 8
+// GFX900-NEXT:    [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
+// GFX900-NEXT:    [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
+// GFX900-NEXT:    store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
+// GFX900-NEXT:    [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
+// GFX900-NEXT:    store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
+// GFX900-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
+// GFX900-NEXT:    store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// GFX900-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
+// GFX900-NEXT:    [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
+// GFX900-NEXT:    [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke
+// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// GFX900-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// GFX900-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// GFX900-NEXT:    call void @__test_block_invoke(ptr [[TMP2]])
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2
+// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// GFX900-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
+// GFX900-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// GFX900-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// GFX900-NEXT:    call void @__test_block_invoke_2(ptr [[TMP2]])
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3
+// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR5]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
+// GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
+// GFX900-NEXT:    store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA13]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
+// GFX900-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA7]]
+// GFX900-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
+// GFX900-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !23 !kernel_arg_access_qual !24 !kernel_arg_type !25 !kernel_arg_base_type !25 !kernel_arg_type_qual !26 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+// GFX900-NEXT:    store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
+// GFX900-NEXT:    [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// GFX900-NEXT:    call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4
+// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]
+// GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR9:[0-9]+]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
+// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
+// GFX900-NEXT:    store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// GFX900-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// GFX900-NEXT:    call void @__test_block_invoke_4(ptr [[TMP2]])
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel
+// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space !27 !kernel_arg_access_qual !20 !kernel_arg_type !28 !kernel_arg_base_type !28 !kernel_arg_type_qual !22 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// GFX900-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
+// GFX900-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
+// GFX900-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
+// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[TBAA7]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
+// GFX900-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke
+// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// GFX900-NEXT:    ret void
+//
+//
+// GFX900: Function Attrs: convergent nounwind
+// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
+// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-NEXT:  entry:
+// GFX900-NEXT:    [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
+// GFX900-NEXT:    store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
+// GFX900-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// GFX900-NEXT:    call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])
+// GFX900-NEXT:    ret void
+//
+//.
+// NOCPU: attributes #0 = { "objc_arc_inert" }
+// NOCPU: attributes #1 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #2 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
+// NOCPU: attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// NOCPU: attributes #4 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #6 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
+// NOCPU: attributes #7 = { nocallback nofree nosync nounwind willreturn }
+// NOCPU: attributes #8 = { convergent nounwind }
+//.
+// GFX900: attributes #0 = { "objc_arc_inert" }
+// GFX900: attributes #1 = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
+// GFX900: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+// GFX900: attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// GFX900: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #6 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #7 = { nocallback nofree nosync nounwind willreturn }
+// GFX900: attributes #8 = { nounwind }
+// GFX900: attributes #9 = { convergent nounwind }
 //.
-// CHECK: attributes #0 = { "objc_arc_inert" }
-// CHECK: attributes #1 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #2 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
-// CHECK: attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-// CHECK: attributes #4 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #5 = { convergent nounwind "enqueued-block" }
-// CHECK: attributes #6 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
-// CHECK: attributes #7 = { nocallback nofree nosync nounwind willreturn }
-// CHECK: attributes #8 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" }
-// CHECK: attributes #9 = { convergent nounwind }
+// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// NOCPU: !1 = !{i32 1, !"wchar_size", i32 4}
+// NOCPU: !2 = !{i32 2, i32 0}
+// NOCPU: !3 = !{i32 1, i32 0, i32 1, i32 0}
+// NOCPU: !4 = !{!"none", !"none", !"none", !"none"}
+// NOCPU: !5 = !{!"char*", !"char", !"long*", !"long"}
+// NOCPU: !6 = !{!"", !"", !"", !""}
+// NOCPU: !7 = !{i32 0}
+// NOCPU: !8 = !{!"none"}
+// NOCPU: !9 = !{!"__block_literal"}
+// NOCPU: !10 = !{!""}
+// NOCPU: !11 = !{i32 0, i32 3}
+// NOCPU: !12 = !{!"none", !"none"}
+// NOCPU: !13 = !{!"__block_literal", !"void*"}
+// NOCPU: !14 = !{!"", !""}
+// NOCPU: !15 = !{i32 1}
+// NOCPU: !16 = !{!"int*"}
 //.
-// CHECK: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// CHECK: !1 = !{i32 1, !"wchar_size", i32 4}
-// CHECK: !2 = !{i32 2, i32 0}
-// CHECK: !3 = !{i32 1, i32 0, i32 1, i32 0}
-// CHECK: !4 = !{!"none", !"none", !"none", !"none"}
-// CHECK: !5 = !{!"char*", !"char", !"long*", !"long"}
-// CHECK: !6 = !{!"", !"", !"", !""}
-// CHECK: !7 = !{i32 0}
-// CHECK: !8 = !{!"none"}
-// CHECK: !9 = !{!"__block_literal"}
-// CHECK: !10 = !{!""}
-// CHECK: !11 = !{i32 0, i32 3}
-// CHECK: !12 = !{!"none", !"none"}
-// CHECK: !13 = !{!"__block_literal", !"void*"}
-// CHECK: !14 = !{!"", !""}
-// CHECK: !15 = !{i32 1}
-// CHECK: !16 = !{!"int*"}
+// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// GFX900: !1 = !{i32 1, !"wchar_size", i32 4}
+// GFX900: !2 = !{i32 2, i32 0}
+// GFX900: !3 = !{!4, !4, i64 0}
+// GFX900: !4 = !{!"long", !5, i64 0}
+// GFX900: !5 = !{!"omnipotent char", !6, i64 0}
+// GFX900: !6 = !{!"Simple C/C++ TBAA"}
+// GFX900: !7 = !{!8, !8, i64 0}
+// GFX900: !8 = !{!"any pointer", !5, i64 0}
+// GFX900: !9 = !{i32 1, i32 0, i32 1, i32 0}
+// GFX900: !10 = !{!"none", !"none", !"none", !"none"}
+// GFX900: !11 = !{!"char*", !"char", !"long*", !"long"}
+// GFX900: !12 = !{!"", !"", !"", !""}
+// GFX900: !13 = !{!5, !5, i64 0}
+// GFX900: !14 = !{!15, !15, i64 0}
+// GFX900: !15 = !{!"int", !5, i64 0}
+// GFX900: !16 = !{!17, !17, i64 0}
+// GFX900: !17 = !{!"queue_t", !5, i64 0}
+// GFX900: !18 = !{i64 0, i64 4, !14}
+// GFX900: !19 = !{i32 0}
+// GFX900: !20 = !{!"none"}
+// GFX900: !21 = !{!"__block_literal"}
+// GFX900: !22 = !{!""}
+// GFX900: !23 = !{i32 0, i32 3}
+// GFX900: !24 = !{!"none", !"none"}
+// GFX900: !25 = !{!"__block_literal", !"void*"}
+// GFX900: !26 = !{!"", !""}
+// GFX900: !27 = !{i32 1}
+// GFX900: !28 = !{!"int*"}
 //.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}

diff  --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
new file mode 100644
index 000000000000..a917d128d09d
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
@@ -0,0 +1,191 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
+// RUN: %clang_cc1 -fno-ident -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" -fdenormal-fp-math-f32=preserve-sign -cl-uniform-work-group-size | FileCheck --check-prefix=SPIR32 %s
+// RUN: %clang_cc1 -fno-ident -ffp-exception-behavior=strict -fexperimental-strict-floating-point -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck --check-prefix=STRICTFP %s
+
+
+// Test that attributes are correctly applied to functions introduced for
+// enqueued blocks
+
+
+typedef void (^bl_t)(local void *);
+typedef struct {int a;} ndrange_t;
+
+kernel void device_side_enqueue(global float *a, global float *b, int i) {
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(void) {
+                   a[i] = 4.0f * b[i] + 1.0f;
+                 });
+}
+// SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone
+// SPIR32-LABEL: define {{[^@]+}}@device_side_enqueue
+// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+// SPIR32-NEXT:  entry:
+// SPIR32-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR32-NEXT:    [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR32-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
+// SPIR32-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr, align 4
+// SPIR32-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4
+// SPIR32-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4
+// SPIR32-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4
+// SPIR32-NEXT:    [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4
+// SPIR32-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4
+// SPIR32-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4
+// SPIR32-NEXT:    store i32 [[I]], ptr [[I_ADDR]], align 4
+// SPIR32-NEXT:    store i32 0, ptr [[FLAGS]], align 4
+// SPIR32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DEFAULT_QUEUE]], align 4
+// SPIR32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4
+// SPIR32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false)
+// SPIR32-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0
+// SPIR32-NEXT:    store i32 24, ptr [[BLOCK_SIZE]], align 4
+// SPIR32-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1
+// SPIR32-NEXT:    store i32 4, ptr [[BLOCK_ALIGN]], align 4
+// SPIR32-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 2
+// SPIR32-NEXT:    store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 3
+// SPIR32-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4
+// SPIR32-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 4
+// SPIR32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I_ADDR]], align 4
+// SPIR32-NEXT:    store i32 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 5
+// SPIR32-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
+// SPIR32-NEXT:    store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
+// SPIR32-NEXT:    [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
+// SPIR32-NEXT:    [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(ptr [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
+// SPIR32-NEXT:    ret void
+//
+//
+// SPIR32: Function Attrs: convergent noinline nounwind optnone
+// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke
+// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] {
+// SPIR32-NEXT:  entry:
+// SPIR32-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4
+// SPIR32-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4
+// SPIR32-NEXT:    store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4
+// SPIR32-NEXT:    store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// SPIR32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// SPIR32-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4
+// SPIR32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP0]], i32 [[TMP1]]
+// SPIR32-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4
+// SPIR32-NEXT:    [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00)
+// SPIR32-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// SPIR32-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4
+// SPIR32-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// SPIR32-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR3]], align 4
+// SPIR32-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP4]], i32 [[TMP5]]
+// SPIR32-NEXT:    store float [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 4
+// SPIR32-NEXT:    ret void
+//
+//
+// SPIR32: Function Attrs: convergent nounwind
+// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel
+// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] {
+// SPIR32-NEXT:  entry:
+// SPIR32-NEXT:    call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]])
+// SPIR32-NEXT:    ret void
+//
+//
+// STRICTFP: Function Attrs: convergent noinline norecurse nounwind optnone strictfp
+// STRICTFP-LABEL: define {{[^@]+}}@device_side_enqueue
+// STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+// STRICTFP-NEXT:  entry:
+// STRICTFP-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// STRICTFP-NEXT:    [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// STRICTFP-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
+// STRICTFP-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr, align 4
+// STRICTFP-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4
+// STRICTFP-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4
+// STRICTFP-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4
+// STRICTFP-NEXT:    [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4
+// STRICTFP-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4
+// STRICTFP-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4
+// STRICTFP-NEXT:    store i32 [[I]], ptr [[I_ADDR]], align 4
+// STRICTFP-NEXT:    store i32 0, ptr [[FLAGS]], align 4
+// STRICTFP-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DEFAULT_QUEUE]], align 4
+// STRICTFP-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4
+// STRICTFP-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR5:[0-9]+]]
+// STRICTFP-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0
+// STRICTFP-NEXT:    store i32 24, ptr [[BLOCK_SIZE]], align 4
+// STRICTFP-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1
+// STRICTFP-NEXT:    store i32 4, ptr [[BLOCK_ALIGN]], align 4
+// STRICTFP-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 2
+// STRICTFP-NEXT:    store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 3
+// STRICTFP-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4
+// STRICTFP-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 4
+// STRICTFP-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I_ADDR]], align 4
+// STRICTFP-NEXT:    store i32 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 5
+// STRICTFP-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
+// STRICTFP-NEXT:    store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
+// STRICTFP-NEXT:    [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
+// STRICTFP-NEXT:    [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(ptr [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
+// STRICTFP-NEXT:    ret void
+//
+//
+// STRICTFP: Function Attrs: convergent noinline nounwind optnone strictfp
+// STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke
+// STRICTFP-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] {
+// STRICTFP-NEXT:  entry:
+// STRICTFP-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4
+// STRICTFP-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4
+// STRICTFP-NEXT:    store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4
+// STRICTFP-NEXT:    store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5
+// STRICTFP-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// STRICTFP-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4
+// STRICTFP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP0]], i32 [[TMP1]]
+// STRICTFP-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4
+// STRICTFP-NEXT:    [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR5]]
+// STRICTFP-NEXT:    [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
+// STRICTFP-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4
+// STRICTFP-NEXT:    [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
+// STRICTFP-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR3]], align 4
+// STRICTFP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP4]], i32 [[TMP5]]
+// STRICTFP-NEXT:    store float [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 4
+// STRICTFP-NEXT:    ret void
+//
+//
+// STRICTFP: Function Attrs: convergent nounwind
+// STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel
+// STRICTFP-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] {
+// STRICTFP-NEXT:  entry:
+// STRICTFP-NEXT:    call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR5]]
+// STRICTFP-NEXT:    ret void
+//
+//.
+// SPIR32: attributes #0 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// SPIR32: attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// SPIR32: attributes #2 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// SPIR32: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+// SPIR32: attributes #4 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+//.
+// STRICTFP: attributes #0 = { convergent noinline norecurse nounwind optnone strictfp "stack-protector-buffer-size"="8" "strictfp" "uniform-work-group-size"="false" }
+// STRICTFP: attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// STRICTFP: attributes #2 = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" }
+// STRICTFP: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+// STRICTFP: attributes #4 = { convergent nounwind "stack-protector-buffer-size"="8" }
+// STRICTFP: attributes #5 = { strictfp }
+//.
+// SPIR32: !0 = !{i32 1, !"wchar_size", i32 4}
+// SPIR32: !1 = !{i32 2, i32 0}
+// SPIR32: !2 = !{i32 1, i32 1, i32 0}
+// SPIR32: !3 = !{!"none", !"none", !"none"}
+// SPIR32: !4 = !{!"float*", !"float*", !"int"}
+// SPIR32: !5 = !{!"", !"", !""}
+//.
+// STRICTFP: !0 = !{i32 1, !"wchar_size", i32 4}
+// STRICTFP: !1 = !{i32 2, i32 0}
+// STRICTFP: !2 = !{i32 1, i32 1, i32 0}
+// STRICTFP: !3 = !{!"none", !"none", !"none"}
+// STRICTFP: !4 = !{!"float*", !"float*", !"int"}
+// STRICTFP: !5 = !{!"", !"", !""}
+//.

diff  --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index bce1a922668a..05a15e476f21 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -393,7 +393,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
 // COMMON:  ret void
 // COMMON: }
 // COMMON: define spir_kernel void [[INVLK2]](ptr addrspace(4){{.*}})
-// COMMON: define spir_kernel void [[INVGK1]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})  [[INVOKE_ATTR:#[0-9]+]]
+// COMMON: define spir_kernel void [[INVGK1]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})  [[INVOKE_KERNEL_ATTR:#[0-9]+]]
 // COMMON: define spir_kernel void [[INVGK2]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})
 // COMMON: define spir_kernel void [[INVGK3]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})
 // COMMON: define spir_kernel void [[INVGK4]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})
@@ -404,7 +404,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
 // COMMON:  ret void
 // COMMON: }
 // COMMON: define spir_kernel void [[INVGK7]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})
-// COMMON: define internal spir_func void [[INVG8]](ptr addrspace(4){{.*}})
+// COMMON: define internal spir_func void [[INVG8]](ptr addrspace(4){{.*}}) [[INVG8_INVOKE_FUNC_ATTR:#[0-9]+]]
 // COMMON: define internal spir_func void [[INVG9]](ptr addrspace(4){{.*}}, ptr addrspace(3) %{{.*}})
 // COMMON: define spir_kernel void [[INVGK8]](ptr addrspace(4){{.*}})
 // COMMON: define spir_kernel void [[INV_G_K]](ptr addrspace(4){{.*}}, ptr addrspace(3){{.*}})
@@ -413,4 +413,5 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
 // COMMON: define spir_kernel void [[INVGK10]](ptr addrspace(4){{.*}})
 // COMMON: define spir_kernel void [[INVGK11]](ptr addrspace(4){{.*}})
 
-// COMMON: attributes [[INVOKE_ATTR]] = { convergent nounwind }
+// COMMON: attributes [[INVG8_INVOKE_FUNC_ATTR]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// COMMON: attributes [[INVOKE_KERNEL_ATTR]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }