[clang] 7a2e12e - [CodeGen][OpenMP] Use correct type in EmitLoadOfPointer()

Nikita Popov via cfe-commits cfe-commits at lists.llvm.org
Mon Mar 21 07:22:44 PDT 2022


Author: Nikita Popov
Date: 2022-03-21T15:22:37+01:00
New Revision: 7a2e12e0a7132739bb2c609477093d2b26ee4f3a

URL: https://github.com/llvm/llvm-project/commit/7a2e12e0a7132739bb2c609477093d2b26ee4f3a
DIFF: https://github.com/llvm/llvm-project/commit/7a2e12e0a7132739bb2c609477093d2b26ee4f3a.diff

LOG: [CodeGen][OpenMP] Use correct type in EmitLoadOfPointer()

The EmitLoadOfPointer() call already specified the right pointer
type, but it did not match the Address we're loading from, so we
need to insert a bitcast first.

Added: 
    

Modified: 
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
    clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
    clang/test/OpenMP/reduction_implicit_map.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index a6cf18c43c19c..1b34cff44e764 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1899,14 +1899,17 @@ static void emitReductionListCopy(
     // new element.
     bool IncrScratchpadSrc = false;
     bool IncrScratchpadDest = false;
+    QualType PrivatePtrType = C.getPointerType(Private->getType());
+    llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
 
     switch (Action) {
     case RemoteLaneToThread: {
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
-      SrcElementAddr = CGF.EmitLoadOfPointer(
-          SrcElementPtrAddr,
-          C.getPointerType(Private->getType())->castAs<PointerType>());
+      SrcElementAddr =
+          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
+                                    SrcElementPtrAddr, PrivateLlvmPtrType),
+                                PrivatePtrType->castAs<PointerType>());
 
       // Step 1.2: Create a temporary to store the element in the destination
       // Reduce list.
@@ -1920,24 +1923,27 @@ static void emitReductionListCopy(
     case ThreadCopy: {
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
-      SrcElementAddr = CGF.EmitLoadOfPointer(
-          SrcElementPtrAddr,
-          C.getPointerType(Private->getType())->castAs<PointerType>());
+      SrcElementAddr =
+          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
+                                    SrcElementPtrAddr, PrivateLlvmPtrType),
+                                PrivatePtrType->castAs<PointerType>());
 
       // Step 1.2: Get the address for dest element.  The destination
       // element has already been created on the thread's stack.
       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
-      DestElementAddr = CGF.EmitLoadOfPointer(
-          DestElementPtrAddr,
-          C.getPointerType(Private->getType())->castAs<PointerType>());
+      DestElementAddr =
+          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
+                                    DestElementPtrAddr, PrivateLlvmPtrType),
+                                PrivatePtrType->castAs<PointerType>());
       break;
     }
     case ThreadToScratchpad: {
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
-      SrcElementAddr = CGF.EmitLoadOfPointer(
-          SrcElementPtrAddr,
-          C.getPointerType(Private->getType())->castAs<PointerType>());
+      SrcElementAddr =
+          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
+                                    SrcElementPtrAddr, PrivateLlvmPtrType),
+                                PrivatePtrType->castAs<PointerType>());
 
       // Step 1.2: Get the address for dest element:
       // address = base + index * ElementSizeInChars.

diff  --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 9e7158fabde65..d8f1519f315ff 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -108,9 +108,9 @@ int bar(int n){
 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
 //
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to double**
+// CHECK: [[ELT:%.+]] = load double*, double** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
 //
 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
@@ -159,11 +159,11 @@ int bar(int n){
 //
 // CHECK: [[DO_COPY]]
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to double**
+// CHECK: [[REMOTE_ELT:%.+]] = load double*, double** [[REMOTE_ELT_REF_CAST]],
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to double**
+// CHECK: [[ELT:%.+]] = load double*, double** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
@@ -329,9 +329,9 @@ int bar(int n){
 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
 //
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to float**
+// CHECK: [[ELT:%.+]] = load float*, float** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
 //
 // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
 // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
@@ -387,11 +387,11 @@ int bar(int n){
 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
 //
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to float**
+// CHECK: [[REMOTE_ELT:%.+]] = load float*, float** [[REMOTE_ELT_REF_CAST]],
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to float**
+// CHECK: [[ELT:%.+]] = load float*, float** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
@@ -612,9 +612,9 @@ int bar(int n){
 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
 //
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i32**
+// CHECK: [[ELT:%.+]] = load i32*, i32** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
 //
 // CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
@@ -626,9 +626,9 @@ int bar(int n){
 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
 //
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i16**
+// CHECK: [[ELT:%.+]] = load i16*, i16** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
 //
 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
@@ -677,20 +677,20 @@ int bar(int n){
 //
 // CHECK: [[DO_COPY]]
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to i32**
+// CHECK: [[REMOTE_ELT:%.+]] = load i32*, i32** [[REMOTE_ELT_REF_CAST]],
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i32**
+// CHECK: [[ELT:%.+]] = load i32*, i32** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
 //
 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to i16**
+// CHECK: [[REMOTE_ELT:%.+]] = load i16*, i16** [[REMOTE_ELT_REF_CAST]],
 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i16**
+// CHECK: [[ELT:%.+]] = load i16*, i16** [[ELT_REF_CAST]],
 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
 // CHECK: br label {{%?}}[[COPY_CONT:.+]]

diff  --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index a06be48566e75..404f8e6c4a9c0 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -453,12 +453,12 @@ void test() {
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex"*
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex"**
+// CHECK1-NEXT:    [[TMP11:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex"* [[TMP13]] to i8*
-// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
+// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP11]] to i64*
 // CHECK1-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -468,7 +468,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
 // CHECK1-NEXT:    [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK1-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -496,13 +496,13 @@ void test() {
 // CHECK1-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK1:       then4:
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
-// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
-// CHECK1-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"*
-// CHECK1-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"*
+// CHECK1-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to %"class.std::complex"**
+// CHECK1-NEXT:    [[TMP43:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP42]], align 8
+// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to %"class.std::complex"**
+// CHECK1-NEXT:    [[TMP46:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP45]], align 8
 // CHECK1-NEXT:    [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8*
-// CHECK1-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
+// CHECK1-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP43]] to i8*
 // CHECK1-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21
 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK1:       else5:
@@ -1020,12 +1020,12 @@ void test() {
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"*
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"**
+// CHECK1-NEXT:    [[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8*
-// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64*
+// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP11]] to i64*
 // CHECK1-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK1-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
 // CHECK1:       .shuffle.pre_cond:
@@ -1049,7 +1049,7 @@ void test() {
 // CHECK1-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
 // CHECK1:       .shuffle.exit:
 // CHECK1-NEXT:    [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT:    store i8* [[TMP31]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK1-NEXT:    [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -1077,13 +1077,13 @@ void test() {
 // CHECK1-NEXT:    br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK1:       then4:
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8
-// CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
-// CHECK1-NEXT:    [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"*
-// CHECK1-NEXT:    [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"*
+// CHECK1-NEXT:    [[TMP50:%.*]] = bitcast i8** [[TMP49]] to %"class.std::complex.0"**
+// CHECK1-NEXT:    [[TMP51:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP50]], align 8
+// CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP53:%.*]] = bitcast i8** [[TMP52]] to %"class.std::complex.0"**
+// CHECK1-NEXT:    [[TMP54:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP53]], align 8
 // CHECK1-NEXT:    [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8*
-// CHECK1-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
+// CHECK1-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP51]] to i8*
 // CHECK1-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27
 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK1:       else5:
@@ -1687,12 +1687,12 @@ void test() {
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex"*
-// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP12]], i64 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex"**
+// CHECK2-NEXT:    [[TMP11:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP10]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP11]], i64 1
 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex"* [[TMP13]] to i8*
-// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
+// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP11]] to i64*
 // CHECK2-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -1702,7 +1702,7 @@ void test() {
 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
 // CHECK2-NEXT:    [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK2-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK2-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -1730,13 +1730,13 @@ void test() {
 // CHECK2-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK2:       then4:
 // CHECK2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
-// CHECK2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
-// CHECK2-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"*
-// CHECK2-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"*
+// CHECK2-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to %"class.std::complex"**
+// CHECK2-NEXT:    [[TMP43:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP42]], align 8
+// CHECK2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to %"class.std::complex"**
+// CHECK2-NEXT:    [[TMP46:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP45]], align 8
 // CHECK2-NEXT:    [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8*
-// CHECK2-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
+// CHECK2-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP43]] to i8*
 // CHECK2-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21
 // CHECK2-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK2:       else5:
@@ -2254,12 +2254,12 @@ void test() {
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"*
-// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"**
+// CHECK2-NEXT:    [[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP11]], i64 1
 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8*
-// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64*
+// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP11]] to i64*
 // CHECK2-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK2-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
 // CHECK2:       .shuffle.pre_cond:
@@ -2283,7 +2283,7 @@ void test() {
 // CHECK2-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
 // CHECK2:       .shuffle.exit:
 // CHECK2-NEXT:    [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK2-NEXT:    store i8* [[TMP31]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK2-NEXT:    [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -2311,13 +2311,13 @@ void test() {
 // CHECK2-NEXT:    br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK2:       then4:
 // CHECK2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8
-// CHECK2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
-// CHECK2-NEXT:    [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"*
-// CHECK2-NEXT:    [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"*
+// CHECK2-NEXT:    [[TMP50:%.*]] = bitcast i8** [[TMP49]] to %"class.std::complex.0"**
+// CHECK2-NEXT:    [[TMP51:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP50]], align 8
+// CHECK2-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK2-NEXT:    [[TMP53:%.*]] = bitcast i8** [[TMP52]] to %"class.std::complex.0"**
+// CHECK2-NEXT:    [[TMP54:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP53]], align 8
 // CHECK2-NEXT:    [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8*
-// CHECK2-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
+// CHECK2-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP51]] to i8*
 // CHECK2-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27
 // CHECK2-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK2:       else5:
@@ -2921,12 +2921,12 @@ void test() {
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex"*
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP12]], i64 1
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex"**
+// CHECK3-NEXT:    [[TMP11:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP10]], align 8
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex", %"class.std::complex"* [[TMP11]], i64 1
 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex"* [[TMP13]] to i8*
-// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
+// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP11]] to i64*
 // CHECK3-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -2936,7 +2936,7 @@ void test() {
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
 // CHECK3-NEXT:    [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK3-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK3-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -2964,13 +2964,13 @@ void test() {
 // CHECK3-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK3:       then4:
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
-// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
-// CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"*
-// CHECK3-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"*
+// CHECK3-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to %"class.std::complex"**
+// CHECK3-NEXT:    [[TMP43:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP42]], align 8
+// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to %"class.std::complex"**
+// CHECK3-NEXT:    [[TMP46:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[TMP45]], align 8
 // CHECK3-NEXT:    [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8*
-// CHECK3-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
+// CHECK3-NEXT:    [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP43]] to i8*
 // CHECK3-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21
 // CHECK3-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK3:       else5:
@@ -3488,12 +3488,12 @@ void test() {
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA19]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA19]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"*
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"**
+// CHECK3-NEXT:    [[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP11]], i64 1
 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8*
-// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64*
+// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP11]] to i64*
 // CHECK3-NEXT:    [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK3-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
 // CHECK3:       .shuffle.pre_cond:
@@ -3517,7 +3517,7 @@ void test() {
 // CHECK3-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
 // CHECK3:       .shuffle.exit:
 // CHECK3-NEXT:    [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK3-NEXT:    store i8* [[TMP31]], i8** [[TMP12]], align 8, !tbaa [[TBAA12]]
 // CHECK3-NEXT:    [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -3545,13 +3545,13 @@ void test() {
 // CHECK3-NEXT:    br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK3:       then4:
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8
-// CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK3-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
-// CHECK3-NEXT:    [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"*
-// CHECK3-NEXT:    [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"*
+// CHECK3-NEXT:    [[TMP50:%.*]] = bitcast i8** [[TMP49]] to %"class.std::complex.0"**
+// CHECK3-NEXT:    [[TMP51:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP50]], align 8
+// CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK3-NEXT:    [[TMP53:%.*]] = bitcast i8** [[TMP52]] to %"class.std::complex.0"**
+// CHECK3-NEXT:    [[TMP54:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP53]], align 8
 // CHECK3-NEXT:    [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8*
-// CHECK3-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
+// CHECK3-NEXT:    [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP51]] to i8*
 // CHECK3-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27
 // CHECK3-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK3:       else5:

diff  --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index afde7e819e935..642530e70cf99 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -4291,12 +4291,12 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK1-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK1-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
 // CHECK1-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -4306,7 +4306,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
 // CHECK1-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8
+// CHECK1-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8
 // CHECK1-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -4334,12 +4334,12 @@ int bar(int n){
 // CHECK1-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK1:       then4:
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
-// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
-// CHECK1-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
-// CHECK1-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
-// CHECK1-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
+// CHECK1-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK1-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 8
+// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK1-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 8
+// CHECK1-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
 // CHECK1-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK1:       else5:
@@ -4632,12 +4632,12 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
 // CHECK1-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
-// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
-// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i64 1
+// CHECK1-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK1-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i64 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
-// CHECK1-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
+// CHECK1-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
 // CHECK1-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
 // CHECK1-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -4647,7 +4647,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i64 1
 // CHECK1-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 8
+// CHECK1-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 8
 // CHECK1-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -4681,12 +4681,12 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
 // CHECK1-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
-// CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 8
-// CHECK1-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
-// CHECK1-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
-// CHECK1-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
+// CHECK1-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK1-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 8
+// CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK1-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 8
+// CHECK1-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
 // CHECK1-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK1:       else6:
@@ -5071,37 +5071,37 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK1-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 8
+// CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 8
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
-// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
+// CHECK1-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK1-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK1-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK1-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK1-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
+// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
 // CHECK1-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 8
+// CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 8
 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -5129,20 +5129,20 @@ int bar(int n){
 // CHECK1-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK1:       then5:
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
-// CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK1-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK1-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK1-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 8
+// CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK1-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 8
+// CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK1-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
-// CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK1-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK1-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK1-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 8
+// CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK1-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 8
+// CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK1-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK1:       else6:
@@ -5249,37 +5249,37 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK1-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 8
+// CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 8
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
-// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
+// CHECK1-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK1-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK1-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK1-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK1-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
+// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
 // CHECK1-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 8
+// CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 8
 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK1-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -5307,20 +5307,20 @@ int bar(int n){
 // CHECK1-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK1:       then5:
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
-// CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK1-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK1-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK1-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 8
+// CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK1-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 8
+// CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK1-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
-// CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK1-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK1-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK1-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 8
+// CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK1-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 8
+// CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK1-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK1:       else6:
@@ -5610,12 +5610,12 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK2-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i32 1
 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK2-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
 // CHECK2-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -5625,7 +5625,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
 // CHECK2-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 4
+// CHECK2-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 4
 // CHECK2-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -5653,12 +5653,12 @@ int bar(int n){
 // CHECK2-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK2:       then4:
 // CHECK2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
-// CHECK2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
-// CHECK2-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
-// CHECK2-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
-// CHECK2-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
+// CHECK2-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK2-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 4
+// CHECK2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK2-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 4
+// CHECK2-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
 // CHECK2-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
 // CHECK2-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK2:       else5:
@@ -5951,12 +5951,12 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK2-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
-// CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
-// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
+// CHECK2-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK2-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 4
+// CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i32 1
 // CHECK2-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
-// CHECK2-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
+// CHECK2-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
 // CHECK2-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
 // CHECK2-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -5966,7 +5966,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
 // CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
 // CHECK2-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 4
+// CHECK2-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 4
 // CHECK2-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -6000,12 +6000,12 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
 // CHECK2-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
 // CHECK2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
-// CHECK2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
-// CHECK2-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
-// CHECK2-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
-// CHECK2-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
+// CHECK2-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK2-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 4
+// CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK2-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 4
+// CHECK2-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
 // CHECK2-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK2:       else6:
@@ -6389,37 +6389,37 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK2-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK2-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
+// CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
-// CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK2-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK2-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK2-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK2-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK2-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK2-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
 // CHECK2-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
+// CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -6447,20 +6447,20 @@ int bar(int n){
 // CHECK2-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK2:       then5:
 // CHECK2-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
-// CHECK2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
-// CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK2-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK2-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK2-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK2-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK2-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK2-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
-// CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
-// CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK2-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK2-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK2-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK2-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK2-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK2:       else6:
@@ -6567,37 +6567,37 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK2-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK2-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
+// CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
-// CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK2-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK2-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK2-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK2-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK2-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK2-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK2-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
 // CHECK2-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
+// CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK2-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -6625,20 +6625,20 @@ int bar(int n){
 // CHECK2-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK2:       then5:
 // CHECK2-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
-// CHECK2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
-// CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK2-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK2-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK2-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK2-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK2-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK2-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
-// CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
-// CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK2-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK2-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK2-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK2-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK2-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK2:       else6:
@@ -6928,12 +6928,12 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK3-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i32 1
 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK3-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
 // CHECK3-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -6943,7 +6943,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
 // CHECK3-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 4
+// CHECK3-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -6971,12 +6971,12 @@ int bar(int n){
 // CHECK3-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK3:       then4:
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
-// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
-// CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
-// CHECK3-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
-// CHECK3-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
+// CHECK3-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK3-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 4
+// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK3-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 4
+// CHECK3-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
 // CHECK3-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
 // CHECK3-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK3:       else5:
@@ -7269,12 +7269,12 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK3-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
-// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
-// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
+// CHECK3-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK3-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i32 1
 // CHECK3-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
-// CHECK3-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
+// CHECK3-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
 // CHECK3-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
 // CHECK3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -7284,7 +7284,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
 // CHECK3-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 4
+// CHECK3-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -7318,12 +7318,12 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
 // CHECK3-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
-// CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
-// CHECK3-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
-// CHECK3-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
-// CHECK3-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
+// CHECK3-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK3-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 4
+// CHECK3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK3-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 4
+// CHECK3-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
 // CHECK3-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK3:       else6:
@@ -7707,37 +7707,37 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK3-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
+// CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
-// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK3-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK3-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK3-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK3-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK3-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK3-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK3-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
 // CHECK3-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
+// CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -7765,20 +7765,20 @@ int bar(int n){
 // CHECK3-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK3:       then5:
 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
-// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
-// CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK3-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK3-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK3-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK3-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK3-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
-// CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
-// CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK3-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK3-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK3-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK3-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK3-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK3:       else6:
@@ -7885,37 +7885,37 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK3-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
+// CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
-// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
-// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK3-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK3-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
-// CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
+// CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
 // CHECK3-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK3-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
 // CHECK3-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
 // CHECK3-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
 // CHECK3-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
+// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
 // CHECK3-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
+// CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK3-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -7943,20 +7943,20 @@ int bar(int n){
 // CHECK3-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
 // CHECK3:       then5:
 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
-// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
-// CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK3-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
-// CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
+// CHECK3-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK3-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK3-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
 // CHECK3-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
-// CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
-// CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK3-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
-// CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
+// CHECK3-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK3-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK3-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
 // CHECK3-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
 // CHECK3:       else6:

diff  --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index b734703ac07de..0b4082fc18698 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -185,12 +185,12 @@ int main()
 // CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
 // CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i64 1
 // CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
 // CHECK-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
@@ -200,7 +200,7 @@ int main()
 // CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
 // CHECK-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8
+// CHECK-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8
 // CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
 // CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
 // CHECK-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
@@ -228,12 +228,12 @@ int main()
 // CHECK-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
 // CHECK:       then4:
 // CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
-// CHECK-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 8
+// CHECK-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
 // CHECK-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
 // CHECK-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK:       else5:


        


More information about the cfe-commits mailing list